/*==================================================================
Project: 	INCREASING THE DEMAND FOR WORKERS WITH A CRIMINAL RECORD
Info: 		Creates all tables for paper.
Program: 	0_analysis_data.do
Created: 	July 2021
Edited:  	July 12, 2022
Purpose: 	Creates analysis data by cleaning and merging raw data.
Note: 		Set the working directory to the Replication folder that contains this do file. Some code has been omitted or censored.
==================================================================*/
cap log close
clear all  
version 16, permanently 
discard 

***** Set paths.
global Main "`c(pwd)'"

* Input Data directory
global input_data "$Main/input_data"

* Output Data directory
global output_data "$Main/analysis_data"

set seed 89449
set sortseed 12345

***** Install packages
ssc install statastates, replace
ssc install gtools, replace
ssc install egenmore, replace
ssc install ipfweight, replace
net install strcompress, replace from(https://raw.github.com/lukestein/strcompress/master/) 
ssc install strdist, replace
ssc install winsor2, replace
ssc install opencagegeo, replace
ssc install insheetjson, replace

/*==================================================================
*** RUN A LOG FILE ***
==================================================================*/
global date = "$S_DATE"
global LogPath = "$Main/log"
capture log close
log using "$LogPath/WCC_$date.log", append

/*==================================================================
*** LOAD PROGRAMS ***
==================================================================*/

***** Programs to Import and Clean Survey Data *****
cap program drop import_survey_excel
program import_survey_excel

syntax using/, saveas(string) [replace] [fall]

if !missing("`replace'") local replace ", replace"

import excel "`using'",  sheet("Sheet0") firstrow clear 
* this created labels with the actual questions

foreach v of varlist _all{
	local raw_label = `v'[1]

	tokenize `: subinstr local raw_label "," ";" , all ', parse("?")
	if `"`3'"' != "" {
		if strpos(`"`1'"',`"the following statement"')>0 & strpos(`"`3'"',"(") == 0 {
			// need to get after the comma to see the meaningful part 
			tokenize `3', parse(";")
			label variable `v' "`3'?" 
		} 
		else label variable `v' "`1'?" 
	}
	else {
		tokenize `: subinstr local raw_label "," ";" , all ', parse("-")
		//find examples 
		local index 1 

		while `"``index''"' != ""{
			if strpos(`"``index''"',"(" ) local first `index'		
			local index = `index' + 1	
		}

		if `"``first''"' != "" { 
			label variable `v' `"``first''"'
		} // if it's not one of the criminal record lists
		else label variable `v' "`1'"
	}

	note `v' : "`raw_label'"
}


drop in 1

drop if inlist(Status, "Survey Preview", "Survey Test")

gen duration = real(Durationinseconds)

* drop question meta-data (for now)
drop *Click* *Submit*
drop q11 // click here to confirm 


if !missing("`fall'") {
	* Fill in answers when we have subsidy = 0 and we don't mention the null subsidy 
	* Baseline demand 
	fill_in Q87, with(Q1125)

	* insurance demand 
	fill_in Q107, with(Q1122)

	* job history 
	fill_in Q110, with(Q1126)

	* clean_record_interested
	fill_in Q1093, with(Q1128)

	* unemployment_interested
	fill_in Q1103, with(Q1130)

	* Crimetype
	forvalues i = 1/6 {
		fill_in Q139_`i', with(Q1131_`i')
	}

	*performance_percent_post
	fill_in Q222_10, with(Q1134_10)

	* subsidy_interested_posterior
	fill_in Q219, with(Q1136)

	rename Q72 email 
	rename Q246 gift_type
	rename Q246_2_TEXT gift_email

	rename Q229 taken_before 

	keep if taken_before == "I have not already shared my answers to this survey and I would like to continue."
}
else {
	rename RecipientEmail email
	rename Q72 gift_type 
	rename Q72_1_TEXT receive_gift_text 
}

* Rename variables
rename RecipientFirstName FirstName
rename RecipientLastName LastName

rename Q61_* *
rename Q56 role_platform_more_info
rename Q200 questions_insurance
rename Q57 role_platform

rename Q87 subsidy_interested
rename Q219 subsidy_interested_posterior

rename Q107 insurance_interested
rename Q110 job_history_interested
rename Q1093 clean_record_interested
rename Q1103 unemployment_interested

rename Q139_1 prop_fin_felony
rename Q139_2 violent_felony
rename Q139_3 substance_felony
rename Q139_4 prop_fin_misdemeanor
rename Q139_5 violent_misdemeanor
rename Q139_6 substance_misdemeanor
rename Q228 how_others_hire

rename Q225_10 performance_percent_pre
rename Q222_10 performance_percent_post

rename Q192 perform_well 
rename Q194 put_others_at_risk
rename Q207 steal_cause_damage
rename Q146 referral_info
rename Q195 technical_issues
rename Q196 other_issues
rename Q145 additional_comments

rename Q173 contact_customers
rename Q172 cash_high_value

* Correct some variable types
destring LocationLongitude LocationLatitude performance_record performance_percent_pre performance_percent_post, replace force

destring years_since_last, gen(years_elapsed) ignore("years") force
destring subsidy_rate ue_rate, ignore("%") force replace

destring info_randomization, force replace

* Read in the date that they took the survey 
gen responsedate1 = clock(StartDate, "MDYhm")
drop if responsedate1 <=  clock("3/1/2020", "MDY")

gen response_date = dofc(responsedate1 )
format response_date %td
drop responsedate1

label variable response_date "Date Respondent Received Survey" 

* Drop emails of people who tested 
** [code omitted due to presence of PII]


* Make sure all responses from the test/beta rounds are censored
tempfile main test_emails
save `main', replace 
clear 
forvalues i = 1/4{
	import delimited "$input_data/test_`i'.csv", clear 
	if `i' > 1 append using `test_emails' 
	save `test_emails', replace 
}
keep email
save `test_emails', replace 
use `main', clear 

* Keep only emails that appear in the master, but do not match onto the test list 
drop if missing(email)

merge 1:1 email using `test_emails', keep(1) nogen 

* Remove Platform emails
gen lemail = ustrlower(email)
pause
drop if strpos(lemail, "platform") //omitted line of code due to 
pause
drop lemail

save "`saveas'" `replace'
end 

***** Program to fill in missing data *****
cap program drop fill_in
program fill_in 
syntax varname, with(varname)

local destination `varlist'

assert missing(`destination') if !missing(`with')
replace `destination' = `with' if missing(`destination')

end 

***** Program to clean .dta file *****
cap program drop clean_survey
program clean_survey
/*
INPUT: 
.dta file, output by import_survey_results.do 

OUTPUT 
.dta file, ready for analysis
*/

syntax using/ , saveas(string) [replace]

if !missing("`replace'") local replace ", replace"
use "`using'", clear 

* Identifiers
gen mgr_id = _n
label variable mgr_id "ID for an HR manager"

isid mgr_id, sort

* Duration
destring Durationinseconds, replace
gen minutes = Durationinseconds / 60


* TREATMENTS

* SUBSIDY RATE
label variable subsidy_rate "Subsidy Rate"
label define subsidy_rate 0 "No Subsidy" 5 "5\% Subsidy" 10 "10\% Subsidy$^\dagger$" 25 "25\% Subsidy" 50 "50\% Subsidy" 100 "100\% Subsidy"
recode subsidy_rate 5 = 10 
label values subsidy_rate subsidy_rate


* INSURANCE CAP (we aggregate $1000 and $5000)
gen ins_cap = .
replace ins_cap = 5*(10^3) if insurance_cap=="$1,000" // combined $1k and $5k for expository reasons
replace ins_cap = 5*(10^3) if insurance_cap=="$5,000"
replace ins_cap = (10^5) if insurance_cap=="$100,000"
replace ins_cap = (10^6) if insurance_cap=="$1 million"
replace ins_cap = 5*(10^6) if insurance_cap=="$5 million"

label variable ins_cap "Insurance Cap"

label define ins_levels 1000 "\\$1,000" 5000 "\\$5,000$^\dagger$" 100000 "\\$100,000" 1000000 "\\$1 Million"  5000000 "\\$5 Million"

label values ins_cap ins_levels

drop insurance_cap

* PAST JOBS
rename performance_record past_jobs

foreach yX in subsidy_rate ins_cap past_jobs years_elapsed {
	egen min_`yX' = min(`yX')
	label variable min_`yX' "Lowest level of `yX'"
}


* OUTCOMES

replace insurance_interested = subsidy_interested if subsidy_interested == "Yes"

replace clean_record_interested = subsidy_interested if subsidy_interested == "Yes"

replace job_history_interested = subsidy_interested if subsidy_interested == "Yes"

replace unemployment_interested = subsidy_interested if subsidy_interested == "Yes"

assert ~mi(subsidy_interested) if Finished == "True"
gen hire_sub = (subsidy_interested=="Yes") if !missing(subsidy_interested)
label variable hire_sub "Would hire under a subsidy"

gen hire_sub_alt = hire_sub
replace hire_sub_alt = 1 if (subsidy_interested=="Only if it's hard to fill my jobs") & !missing(subsidy_interested)
label variable hire_sub_alt "Would hire under a subsidy, including only if hard to fill"

* insurance_interested
gen hire_ins = (insurance_interested=="Yes") if ~mi(insurance_interested)
replace hire_ins = 1 if hire_sub ==1
label variable hire_ins "Would hire under insurance and subsidy"

* including "only if hard to fill"
gen hire_ins_alt = (insurance_interested=="Yes") if ~mi(insurance_interested)
replace hire_ins_alt = 1 if hire_sub_alt ==1 | (insurance_interested=="Only if it's hard to fill my jobs")
label variable hire_ins_alt "Would hire under insurance and subsidy, including only if hard to fill"

*  job_history_interested
gen hire_hist = (job_history_interested=="Yes") if ~mi(job_history_interested)
replace hire_hist = 1 if hire_sub ==1
label variable hire_hist "Would hire under past performance history"

gen hire_hist_alt = (job_history_interested=="Yes") if ~mi(job_history_interested)
replace hire_hist_alt = 1 if hire_sub_alt ==1 | (job_history_interested=="Only if it's hard to fill my jobs")
label variable hire_hist_alt "Would hire under past performance history, including only if hard to fill"

* unemployment
gen hire_unemp = (unemployment_interested == "Yes") if !missing(unemployment_interested)
replace hire_unemp = 1 if hire_sub == 1
label variable hire_unemp "Would hire with given unemployment rate"

gen hire_unemp_alt = (unemployment_interested=="Yes") if ~mi(unemployment_interested)
replace hire_unemp_alt = 1 if hire_sub_alt ==1 | (unemployment_interested=="Only if it's hard to fill my jobs")
label variable hire_unemp_alt "Would hire with given unemployment rate, including only if hard to fill"

* limited criminal history screening  
gen hire_clean = (clean_record_interested == "Yes") if !missing(clean_record_interested)
replace hire_clean = 1 if hire_sub == 1
label variable hire_clean "Would hire with given time elasped since last offense"

gen hire_clean_alt = (clean_record_interested=="Yes") if ~mi(clean_record_interested)
replace hire_clean_alt = 1 if hire_sub_alt ==1 | (clean_record_interested=="Only if it's hard to fill my jobs")
label variable hire_clean_alt "Would hire with given time elasped since last offense, including only if hard to fill"


* Process Likert Scale Vars, see helper program at bottom     
gen_likert perform_well
label var perform_well_num "Confidence a WCs will Perform Well"

gen_likert put_others_at_risk
label var put_others_at_risk_num "Concern a WCs will Put Others at Risk"

gen_likert steal_cause_damage
label var steal_cause_damage_num "Concern a WCs will Steal or Cause Damage"


* INFORMATION / UPDATING
gen hire_sub_posterior = subsidy_interested_posterior == "Yes" if !missing(subsidy_interested_posterior)
label variable hire_sub_posterior "Would hire with subsidy after information treatment"

* ENFORCE RANGE AROUND TRUTH
gen ln_prior = max(ln(43.5),ln(performance_percent_pre)) if info_type ==1 
replace ln_prior = min(ln(12),ln(performance_percent_pre)) if info_type ==2 

gen ln_posterior = max(ln(43.5),ln(performance_percent_post)) if info_type ==1 
replace ln_posterior = min(ln(12),ln(performance_percent_post)) if info_type ==2 

gen perception_update = ln_posterior - ln_prior
label variable perception_update "Difference between Posterior and Prior Beliefs"

gen perception_gap = ln(87) - ln_prior if info_type == 1 
replace perception_gap = ln(3) - ln_prior if info_type == 2

label variable perception_gap "Perception Gap (Signal - Prior Belief)"

* info type 1 is the share of WCC with a high score
* info type 2 is the share of WCC with a low score (i.e. not a high score)
* in this way an overestimate of info type 1 is (almost) the same as an underestimate of info type 2 

* i.e. t1 ~~ 100 - t2 
* truth_t1 - pre_t1 ~~ 100 - truth_t2 - 100 + pre_t2 = -(truth_t2 - pre_t2) 

gen norm_perception_update    = perception_update   if info_type == 1
gen norm_perception_gap     = perception_gap    if info_type == 1

replace norm_perception_update  = -perception_update  if info_type == 2
replace norm_perception_gap   = -perception_gap     if info_type == 2

label define info_rand 0 "Not Shown Information" 1 "Shown Information"
label values info_randomization info_rand


* GEOCODE INFO
drop street number 
rename confidence geocode_confidence
rename quality geocode_precision
label variable geocode_precision "How precise is the address we matched?"


* ADDITIONAL FIRM COVARIATES 
* From the distribution data, Platform admin, and Infogroup
* make more useful SIC codes 
cap: tostring sic, replace  
gen sic_2dig = substr(sic,1,2)
gen sic_1dig = substr(sic,1,1)

gen sic1 = "Agriculture, Forestry \& Fishing" if inlist(sic_2dig,"01","02","07","08","09")
replace sic1 = "Mining" if inlist(sic_2dig,"10","11","12","13","14")
replace sic1 = "Construction" if inlist(sic_2dig,"15","16","17")
replace sic1 = "Manufacturing" if inlist(sic_1dig,"2","3")
replace sic1 = "Transportation \& Public Utilities" if inlist(sic_1dig,"4")
replace sic1 = "Retail" if inlist(sic_1dig,"5")
replace sic1 = "Wholesale Trade" if inlist(sic_2dig,"50","51")
replace sic1 = "Finance, Insurance, \& Real Estate" if inlist(sic_1dig,"6")
replace sic1 = "Service" if inlist(sic_1dig,"7","8")
replace sic1 = "Public Administration" if inlist(sic_1dig,"9")
replace sic1 = "Nonclassifiable" if inlist(sic_2dig,"99")

replace industry = sic1 if missing(industry)
replace industry = "Service" if industry == "Services"
replace industry = "Retail" if industry == "Retail Trade"

* Modal job category 
rename modal_category modal_category_derived
gen modal_category = modal_category_admin

* Fill in missings, and see if we can do better than "Other"
replace modal_category = modal_category_derived if missing(modal_category) | modal_category == "Other"
replace modal_category = "Washing \& Cleaning" if modal_category == "Washing & Cleaning"

cap drop service back_office manufacturing
gen service = inlist(modal_category, "Delivery", "Event Staff", "Customer Service", "Washing & Cleaning") if !missing(modal_category)
gen back_office = inlist(modal_category,"Administrative","Software Testing","Brand Ambassador","General Labor") if !missing(modal_category)
gen manufacturing = inlist(modal_category, "Fulfillment / Warehousing", "Food Production", "Manufacturing/Production") if !missing(modal_category)

* This takes lots of weird values, let's make them nice
rename firm_id firm_id1
egen firm_id = group(firm_id1)
drop firm_id1

destring estab_year ,replace
gen estab_age = 2020 - estab_year

* Label these firm variables 
label variable estab_name "Hiring Account Name (as entered)"
label variable firm_id "Group together large firms with many accounts."
label var estab_name "Requestor Establishment (Raw)"
label var company "Requestor Establishment (Cleaned, for matching)"


* MAKE DUMMIES FOR SHRM SURVEY QUESTIONS
gen shrm_best_candidate = strpos(shrm_factors,"We want to work with the best candidate for the job regardless of criminal history") > 0 if !missing(shrm_factors)
gen shrm_second_chance = strpos(shrm_factors,"We want to give a second chance to individuals") > 0 if !missing(shrm_factors)
gen shrm_incentivized = strpos(shrm_factors,"We are incentivized by tax rebates or other government incentives") > 0 if !missing(shrm_factors)
gen shrm_customers = strpos(how_others_hire,"How customers would react to knowing that some employees have criminal records") > 0 if !missing(how_others_hire)
gen shrm_regulations = strpos(how_others_hire,"Local, state, or federal regulations that make it impossible or very difficult to hire individuals with criminal records") > 0 if !missing(how_others_hire)
gen shrm_performance = strpos(how_others_hire,"Individuals with criminal records won’t be effective employees") > 0 if !missing(how_others_hire)
gen byte wcc_policy = (shrm_policy == "Yes")  if !missing(shrm_policy)

* Tag incomplete responses
gen incomplete = Finished == "False"
rename state state_name 
rename state_abbrev state 

* Generate variables for answered questions
gen answered_all_demand = !missing(hire_sub_posterior)
gen answered_treatment_demand = !missing(substance_mis)
gen answered_first = !missing(role_platform)
gen answered_any_WCC = !missing(authority)
keep if answered_first


* Generate job type flags
g byte cust_int   = (contact_customers == "Yes") if !missing(contact_customers)
g byte high_val   = (cash_high_value == "Yes") if !missing(cash_high_value)
g byte hr = (position == "HR Professional") if !missing(position)
g        exp_hire = experience
replace  exp_hire = "0"  if exp_hire == "Less than 1 year"
replace  exp_hire = "16" if exp_hire == "16 + years"
replace  exp_hire = subinstr(exp_hire, "years","",.)
replace  exp_hire = subinstr(exp_hire, "year","",.)
destring exp_hire, replace

* Generate effective wage
gen effective_wage = 100-subsidy_rate
label var effective_wage "Effective Wage (%)"

* Generate crimetype hiring flags
local crimetype "prop_fin_fel violent_fel substance_fel prop_fin_mis violent_mis substance_mis"
foreach i in `crimetype'{
	assert ~mi(`i') if Finished == "True"
	g hire_`i' = (`i'=="Yes") if !missing(`i')
}
label variable hire_prop_fin_fel  "Would hire with property or financial felony"
label variable hire_violent_fel   "Would hire with violent felony"
label variable hire_substance_fel "Would hire with substance felony"
label variable hire_prop_fin_mis  "Would hire with property or financial misdemeanor"
label variable hire_violent_mis   "Would hire with violent misdemeanor"
label variable hire_substance_mis "Would hire with substance misdemeanor"

* Generate subsidy lumping 5 and 10 together
gen sub_lumped = subsidy_rate
replace sub_lumped = 7 if subsidy_rate==5 | subsidy_rate==10
la var sub_lumped "Subsidy rate with low levels lumped together"

* Process string 
local saveas: subinstr local saveas ".dta" "", all
tokenize `saveas', parse("\/")
local m_i = 1
while !missing("``=`m_i'+1''"){
	local path "`path'``m_i++''"
} 
local filename  "``m_i''"
di "`path' || `filename'"
save "`saveas'_wide_include_incomplete.dta" `replace'
preserve
keep if answered_treatment_demand
save "`saveas'_wide.dta" `replace'
restore, preserve

* RESHAPE LONG:
* Create new "long" dataset for crime regressions
keep FirstName LastName email mgr_id ///
subsidy_rate hire_sub ///
hire_prop_fin_fel hire_violent_fel hire_substance_fel ///
hire_prop_fin_mis hire_violent_mis hire_substance_mis firm_id effective_wage response_date answered_treatment_demand ///
cust_int high_val minutes authority n_jobs
g id = _n
reshape long hire_, i(id) j(question) string
rename hire_ hire

* Create indicator for baseline question
g byte baseline = (question == "sub")

* Create indicators for questions about conviction type
local crimetype "prop_fin_fel violent_fel substance_fel prop_fin_mis violent_mis substance_mis"
foreach i in `crimetype'{
	g byte `i' = (question == "`i'")
}

label variable violent_fel   "Violent Felony"
label variable prop_fin_fel  "Property Felony"
label variable substance_fel "Drug Felony"
label variable violent_mis   "Violent Misd."
label variable prop_fin_mis  "Property Misd."
label variable substance_mis "Drug Misd."

save "`saveas'_longcrime_include_incomplete.dta" `replace'

keep if answered_treatment_demand
save "`saveas'_longcrime.dta" `replace'

restore, preserve 

* RESHAPE LONG:
* IMPLIED OBSERVATION WITH INSURANCE/ JOB HISTORY/ UNEMPLOYMENT RATE/ TIME ELAPSED = 0
expand 2, generate(implied)
replace ins_cap = 0 if implied
replace ue_rate = 0 if implied
replace past_jobs = 0 if implied
replace years_elapsed = 0 if implied

foreach second_round in hire_unemp hire_hist hire_ins hire_clean {
	replace `second_round' = hire_sub if implied
}

label variable implied "all hire outcomes = hire_sub. additional policies (implied) 0"

* Save to long
save "`saveas'_long_include_incomplete.dta" `replace'
keep if answered_treatment_demand
save "`saveas'_long.dta" `replace'

end 


***** Program that processes Likert responses *****
program gen_likert 
syntax varname 

gen `varlist'_num = 1 if `varlist' == "Strongly disagree"
replace `varlist'_num = 2 if `varlist' == "Weakly disagree"
replace `varlist'_num = 3 if `varlist' == "Neutral"
replace `varlist'_num = 4 if `varlist' == "Weakly agree"
replace `varlist'_num = 5 if `varlist' == "Strongly agree"

end 

***** Program that uses OpenCage Geocoder API to get states from lat, long *****
cap program drop get_states
program get_states
/*
input: 		.dta
output: 	.dta mapping latitude/longitude to states

options:
+ update will update the mapping (i.e. to include new 
lat/longitudes). Since this calls the API, this step 
can be time consuming and should only be used when new 
observations have been added. 
Otherwise, the saved geocoding will be merged on

+ replace allows the file specified in (saveas)

* futureproofing note: 
* if API key expires, can make a new account/get new key at 
* https://opencagedata.com/
*/
syntax using/ , saveas(string) state_mapping_path(string) [replace]

local api_key = "6c4eec217abf4dd7ab362e225b429e46"

if !missing("`replace'") local replace ", replace"

* Make sure we got valid file paths before we start querying the API
foreach file in saveas state_mapping_path{
	cap: confirm new file `"``file''"'
	local error = _rc

	if `error' == 602 & missing("`replace'") & "`file'" == "saveas" {
		di as error `"``file'' exists. Try using replace or changing the file name."'
	}
	else if `error' != 0 &  `error' != 602 {
		di as error `"``file'' can't be written to"'
		exit `error'
	}
}
count 
use "`using'", clear 
count 

cap: confirm file `"`state_mapping_path'"'
if _rc == 0 local mapping "exists"

if "`mapping'" == "exists" { // i.e. some file exists
	merge m:1 LocationLatitude LocationLongitude using `"`state_mapping_path'"', keep(1 3)
	count 
	cap: assert _m == 3 if !missing(LocationLatitude) & !missing(LocationLongitude)

	* If we already have geo info for everyone (i.e. merged successfully), we're done
	if _rc == 0 {
		drop _m
		save "`saveas'" `replace'
		exit
	} 
	count 

	* Otherwise, we need to match anything that we don't already have information on
	keep if _m == 1 // keep if this lat/long weren't in the mapping 
	drop _m
}

* we only need the long/lat 
keep LocationLatitude LocationLongitude
duplicates drop 
drop if missing(LocationLatitude) | missing(LocationLongitude)

opencagegeo, key(`api_key') latitude(LocationLatitude) longitude(LocationLongitude) replace

rename g_* *

* If we already have a mapping, add this to it
if "`mapping'" == "exists" append using `"`state_mapping_path'"'

save `"`state_mapping_path'"', replace 

* Finally, merge the mapping back on to the input data
use "`using'", clear 
merge m:1 LocationLatitude LocationLongitude using `"`state_mapping_path'"'

count 
drop if _m == 2 
assert _m == 3 if !missing(LocationLatitude) & !missing(LocationLongitude)

drop _m

count 
save "`saveas'" `replace'

count 
end 


/*==================================================================
*** PREPARE RAW DATA ***
==================================================================*/

***** Platform Reference Data *****           
import excel using "$input_data/requestors_for_MIKE.xlsx", clear sheet(requestor details) firstrow
tostring requestorid, replace 
tempfile p1 
save `p1', replace 
import excel using "$input_data/requestors_for_MIKE.xlsx", clear sheet(requestor details 2) firstrow
append using `p1'

drop L M N O P Q R S T U V W X 

* Extract the industry class without numbers
gen industry1 = regexr(industry, "[0-9]*\-[0-9]* \- ", "")
drop industry

* Winsorize the firm size variable
winsor2 localemployees, trim cut(0 99.9)
gen platform_empsizeN = localemployees_tr

rename platform_empsizeN empsize1

* Create platform tenure variable
gen platform_tenure1 = mofd(date("4/1/2020", "MDY")) - mofd(dofc(createddate))

* Create industry, platform tenure, and firm size variables based on the mode of each variable within each requestor
egen industry = mode(industry1), by(email)
egen platform_tenure = mean(platform_tenure1), by(email)
egen empsize = mean(empsize1), by(email)

encode customer, gen(firm_id)
keep industry platform_tenure empsize email firm_id state
duplicates drop 

* Rename different industry titles to match with convention for data we will merge in
replace industry = "Finance, Insurance, \& Real Estate" if industry == "Finance, Insurance, Real Estate"
replace industry = "Transportation \& Public Utilities" if industry == "Transportation & Public Utilities"
replace industry = "Service" if industry == "Services"
replace industry = "Retail" if industry == "Retail Trade"

save "$output_data/platform_universe.dta", replace 


***** Prepare job/requestor metadata from Platform *****
* These data are spread over two sheets, so need to read in one sheet at a time
import excel "$input_data/requestor_emails_jobs.xlsx", sheet("Sheet1") firstrow clear 

tempfile requestor_crosswalk 
save `requestor_crosswalk', replace 

import excel "$input_data/requestor_emails_jobs.xlsx", sheet("Sheet2") firstrow clear 
append using `requestor_crosswalk'

tostring jobid, replace

save `requestor_crosswalk', replace

* Get job dates
tempfile dates
foreach j in 2018_1 2018_2 2019_1 2019_2 {
	import delimited "$input_data/jobs_`j'.csv", varnames(1) clear bindquotes(strict) maxquotedrows(unlimited)
	tostring job_id, replace
	if "`j'" != "2018_1" append using `dates'
	save `dates', replace
}

* Variable for having a job in 2018 to 2019 data
gen active = 1

* Clean data that was entered in wrong columns
* [code omitted due to presence of PII]

rename job_id jobid
destring jobid, gen(jobidn) force
drop if missing(jobidn)

keep jobid start_time state_code multi_day_job_request_id multi_day_job job_state active

* Create start time variable
tempvar started_time
gen `started_time' = subinstr(start_time,"+0000","",.)
gen start = clock(`started_time', "YMDhms")

* Restricting to most recent observation for each job
bysort jobid: egen most_recent = max(start)
keep if start == most_recent

* Generate indicator variable for whether a job is a multi day job
gen multi_day = multi_day_job == "multi day job"
drop multi_day_job

* Drop unneeded variables and drop duplicates
drop most_recent `started_time'
duplicates drop

* Merge in requestors crosswalk
merge 1:1 jobid using `requestor_crosswalk'
drop _merge
tempfile jobs 
save `jobs', replace 

* Get job categories
tempfile categories
foreach j in 2018 2019_1 2019_2 {
	import delimited "$input_data/ratings_`j'.csv", varnames(1) clear bindquotes(strict) // maxquotedrows(unlimited)
	if "`j'" != "2018" append using `categories'
	save `categories', replace
}

rename job_id jobid
destring jobid, gen(jobidn) force
drop if missing(jobidn)
drop if missing(created_at)

* Create another variable indicating activity based on there being an observation in the ratings data from 2018 to early 2020
gen active2 = 1

keep jobid category job_state created_at active2
recast str jobid

* Get most recent time that rating was created
tempvar cleaned_time
gen `cleaned_time' = subinstr(created_at,"+0000","",.)
gen post_time = clock(`cleaned_time', "YMDhms")
bysort jobid: egen most_recent = max(post_time)
keep if post_time == most_recent

* Drop unneeded variables and drop duplicates to keep only most recent observations within each job
drop most_recent `cleaned_time'
duplicates drop

* Merge in jobs main data
merge 1:1 jobid using `jobs'
drop _merge 

* Replace post_time with start time if post_start is missing
replace post_time = start if missing(post_time) & !missing(start_time)

tempfile main 
save `main', replace 

* Get employment size bins
tempfile empsize
foreach j in 2018 2019_1 2019_2 {
	import delimited "$input_data/customer_details_`j'.csv", varnames(1) clear bindquotes(strict)

	if "`j'" != "2018" append using `empsize'
	save `empsize', replace
}

rename job_id jobid  

tostring jobid, replace 
drop if missing(jobid)

* Keep observations that match or were in main
merge 1:1 jobid using `main', nogen keep(2 3)
save `main', replace 

* Get the firms for these requestors
merge m:1 email using "$input_data/firm_covariates.dta" 
label variable estab_name "Hiring Account Name (as entered)"

* This takes lots of weird values, let's make them nice
rename firm_id firm_id1
egen firm_id = group(firm_id1)
drop firm_id1

label variable firm_id "Group together large firms with many accounts."

preserve 

keep firm_id email 
tempfile firm_cw 
save `firm_cw', replace 

restore

* Create variables for mode of firm modal category, firm size range, and category for each firm
egen modal_category_admin = mode(primary_job_category), by(firm_id) minmode
egen emp_range_admin = mode(num_employees_range), by(firm_id) minmode
egen modal_category = mode(category), by(firm_id) minmode

* Create variable indicating whether a job was not filled
gen vacant_job = job_state != "approved" & job_state != "completed" if !missing(job_state)

* Create date variables for when job rating was posted
gen march = post_time >= clock("3-1-2020", "MDY") if !missing(post_time)
gen since_feb = post_time >= clock("2-1-2020", "MDY") if !missing(post_time)
gen all2020 = post_time >= clock("1-1-2020", "MDY") if !missing(post_time)
gen jan2019 = post_time >= clock("1-1-2019", "MDY") & post_time < clock("2-1-2019", "MDY") if !missing(post_time)
gen feb2019 = post_time >= clock("2-1-2019", "MDY") & post_time < clock("3-1-2019", "MDY") if !missing(post_time)
gen mar2019 = post_time >= clock("3-1-2019", "MDY") & post_time < clock("4-1-2019", "MDY") if !missing(post_time)
gen apr2019 = post_time >= clock("4-1-2019", "MDY") & post_time < clock("5-1-2019", "MDY") if !missing(post_time)
gen may2019 = post_time >= clock("5-1-2019", "MDY") & post_time < clock("6-1-2019", "MDY") if !missing(post_time)
gen jun2019 = post_time >= clock("6-1-2019", "MDY") & post_time < clock("7-1-2019", "MDY") if !missing(post_time)
gen jul2019 = post_time >= clock("7-1-2019", "MDY") & post_time < clock("8-1-2019", "MDY") if !missing(post_time)
gen aug2019 = post_time >= clock("8-1-2019", "MDY") & post_time < clock("9-1-2019", "MDY") if !missing(post_time)
gen sep2019 = post_time >= clock("9-1-2019", "MDY") & post_time < clock("10-1-2019", "MDY") if !missing(post_time)
gen oct2019 = post_time >= clock("10-1-2019", "MDY") & post_time < clock("11-1-2019", "MDY") if !missing(post_time)
gen nov2019 = post_time >= clock("11-1-2019", "MDY") & post_time < clock("12-1-2019", "MDY") if !missing(post_time)
gen dec2019 = post_time >= clock("12-1-2019", "MDY") & post_time < clock("1-1-2020", "MDY") if !missing(post_time)
gen jan2020 = post_time >= clock("1-1-2020", "MDY") & post_time < clock("2-1-2020", "MDY") if !missing(post_time)
gen all2019 = post_time >= clock("1-1-2019", "MDY") & post_time < clock("1-1-2020", "MDY") if !missing(post_time)

duplicates tag jobid, gen(tag)
egen custID_SD = sd(customer_id), by(firm_id)

* Create new jobid variable 
egen jobidn = group(jobid)

* Generate indicator for approved or completed jobs
bysort jobidn: gen approved_job = 1 if (job_state == "approved" | job_state == "completed") & _n == 1

* Generate indicator for having any job
gen has_job = 1 if !missing(post_time) | !missing(active) | !missing(active2)

* Collapse by firm
collapse vacant_job (firstnm) state_code custID_SD customer_id modal_category emp_range_admin modal_category_admin has_job (sum) approved_job march since_feb all2020 jan2019-dec2019 all2019 jan2020 multiday_count=multi_day (count) n_jobs = jobidn, by(firm_id)
replace multiday_count = . if has_job != 1

* Merge in firm email addresses
merge 1:m firm_id using `firm_cw', nogen 
duplicates drop 

save "$output_data/platform_requestor_info.dta", replace 


***** Prepare Infogroup Reference Data *****
* takes long time to run
import delimited "$input_data/2019_Business_Academic_QCQ.txt", clear varnames(1) 

**** SIZE 
gen infogroup_empsizeN =  2.5 if locationemployeesizecode == "A"
replace infogroup_empsizeN =  7 if locationemployeesizecode == "B"
replace infogroup_empsizeN =  14.5 if locationemployeesizecode == "C"
replace infogroup_empsizeN =  34.5 if locationemployeesizecode == "D"
replace infogroup_empsizeN =  74.5 if locationemployeesizecode == "E"
replace infogroup_empsizeN =  174.5 if locationemployeesizecode == "F"
replace infogroup_empsizeN =  374.5 if locationemployeesizecode == "G"
replace infogroup_empsizeN =  699.5 if locationemployeesizecode == "H"
replace infogroup_empsizeN =  3000 if locationemployeesizecode == "I"
replace infogroup_empsizeN =  7500 if locationemployeesizecode == "J"
replace infogroup_empsizeN =  10000 if locationemployeesizecode == "K"

rename infogroup_empsizeN empsize

**** AGE 
gen estab_age = 2020 - yearestablished

**** INDUSTRY 
cap: tostring siccode, replace 
gen sic_2dig = substr(siccode,1,2)
gen sic_1dig = substr(siccode,1,1)
gen sic = "Agriculture, Forestry \& Fishing" if inlist(sic_2dig,"01","02","07","08","09")
replace sic = "Mining" if inlist(sic_2dig,"10","11","12","13","14")
replace sic = "Construction" if inlist(sic_2dig,"15","16","17")
replace sic = "Manufacturing" if inlist(sic_1dig,"2","3")
replace sic = "Transportation \& Public Utilities" if inlist(sic_1dig,"4")
replace sic = "Retail" if inlist(sic_1dig,"5")
replace sic = "Wholesale Trade" if inlist(sic_2dig,"50","51")
replace sic = "Finance, Insurance, \& Real Estate" if inlist(sic_1dig,"6")
replace sic = "Service" if inlist(sic_1dig,"7","8")
replace sic = "Public Administration" if inlist(sic_1dig,"9")
replace sic = "Nonclassifiable" if inlist(sic_2dig,"99")

keep company sic estab_age empsize

duplicates drop 

strcompress
if _rc != 0 net install strcompress, from(https://raw.github.com/lukestein/strcompress/master/)

save "$output_data/infogroup_universe.dta", replace 


/*==================================================================
Process raw survey data
==================================================================*/
*** Read in the raw survey data
import_survey_excel using "$input_data/Platform Criminal Applicants Survey - 18 Dec - Copy_May 13, 2020_10.19.xlsx", /// 
saveas("$output_data/distribution1.dta") replace 

import_survey_excel using "$input_data/Platform Criminal Applicants Survey - 18 Dec - Copy - LOW RATINGS_May 13, 2020_10.18.xlsx", /// 
saveas("$output_data/distribution2.dta") replace 

import_survey_excel using "$input_data/Platform Criminal Applicants Survey - 18 Dec - Copy - NO DISCOUNT_May 13, 2020_10.18.xlsx", /// 
saveas("$output_data/nodiscount.dta") replace 

import_survey_excel using  "$input_data/Platform Criminal Applicants Survey - Fall 20_October 12, 2020_14.16.xlsx", /// 
saveas("$output_data/postcovid.dta") replace fall

*** ACCOUNT FOR DIFFERENCES ACCROSS THE SURVEY TYPES
use "$output_data/distribution1.dta", clear 
gen displayed_zero = 1 if subsidy_rate == 0
gen displayed_zero_sample = 1
gen info_type = 1 
gen survey_arm = 1

append using "$output_data/distribution2.dta"
append using "$output_data/nodiscount.dta"

gen postcovid = 0 
append using "$output_data/postcovid.dta"
replace postcovid = 1 if missing(postcovid)

** Drop duplicates, keeping the version with least missing answers
bysort email: egen dups = count(StartDate)
egen missing_answers = rowmiss(role_platform-steal_cause_damage)

bysort email (missing_answers dups StartDate): gen best_coverage = _n == 1

keep if best_coverage ==1 
drop missing_answers dups

save "$output_data/Appended_Surveys.dta", replace

* Fill in missings/ survey characteristics 
replace survey_arm = 2 if missing(survey_arm)
replace info_type = 2 if missing(info_type) 
replace displayed_zero_sample = displayed_zero_sample == 1
replace displayed_zero = displayed_zero == 1 if subsidy_rate == 0

label define information 1 "4 or 5 Stars" 2 "Low Rating/ No Show"
label values info_type information

label define zeroes1 0 "No Subsidy Displayed" 1 "0\% Subsidy Explicitly Displayed"
label define zeroes2 0 "Survey: No Subsidy Displayed" 1 "Survey: Explicitly Displayed Zero"

label values displayed_zero zeroes1
label values displayed_zero_sample zeroes2

label variable displayed_zero "If subsidy_rate is 0, was this displayed to respondent? (missing for nonzero subsidy)"
label variable displayed_zero_sample "Was the respondent assigned to the survey arm where zeroes were displayed?"

save "$output_data/combined_nogeocode.dta", replace 

* the API has a limited number of daily queries, so we don't want to call the API every time we change something else 
* in the code. every time it runs, we save the geocode mapping in $output_data, see get_states program above for details
get_states using "$output_data/combined_nogeocode.dta", saveas("$output_data/combined_distributions.dta") state_mapping_path("$output_data/latlong_geocode.dta") replace 

snapshot save


/*==================================================================
Create make_business_name_cw.R, which uses a fuzzy merge
to match business names in the survey to clean firm names in the
infogroup data. 

Read in the infogroup data, keep only the data that we will
use, attach the crosswalk so that we can merge it directly onto
the main survey panel, and save infogroup_data.dta in 
data/relational.
==================================================================*/


* preprocess survey names 
* start with the survey 
use "$output_data/combined_distributions.dta", clear 
cap drop company
keep if country == "United States of America"

merge m:1 email using "$input_data/firm_covariates.dta", keepusing(estab_name) keep(3) nogen 

label var estab_name "Requestor Establishment (Raw)"

* standardize names with NBER scripts 
gen standard_name = trim(estab_name)
replace standard_name=upper(standard_name)
replace standard_name = subinstr(standard_name,"-ADR"," ",30)
replace standard_name = subinstr(standard_name,"-ADS"," ",30)
replace standard_name = subinstr(standard_name,"-CL A "," ",30)
replace standard_name = subinstr(standard_name,"-CL B "," ",30)
replace standard_name = subinstr(standard_name,"-CONN "," ",30)
replace standard_name = subinstr(standard_name,"-CONSOLIDATED "," ",30)
replace standard_name = subinstr(standard_name,"-DEL "," ",30)
replace standard_name = subinstr(standard_name,"-DE "," ",30)
replace standard_name = subinstr(standard_name,"-NY SHARES "," ",30)
replace standard_name = subinstr(standard_name,"-OLD "," ",30)
replace standard_name = subinstr(standard_name,"-ORD "," ",30)
replace standard_name = subinstr(standard_name,"-PRE AMEND "," ",30)      /* JB */
replace standard_name = subinstr(standard_name,"-PRE DIVEST "," ",30)     /* JB */
replace standard_name = subinstr(standard_name,"-PREAMEND "," ",30)       /* JB */
replace standard_name = subinstr(standard_name,"-PREDIVEST "," ",30)      /* JB */
replace standard_name = subinstr(standard_name,"-PROJ "," ",30)       /* JB */
replace standard_name = subinstr(standard_name,"-PROJECTED "," ",30)      /* JB */
replace standard_name = subinstr(standard_name,"-PREF "," ",30)       /* JB */
replace standard_name = subinstr(standard_name,"-PRE FASB "," ",30)       /* JB */
replace standard_name = subinstr(standard_name,"-PREFASB "," ",30)        /* JB */
replace standard_name = subinstr(standard_name,"-PRO FORMA "," ",30)
replace standard_name = subinstr(standard_name,"- PRO FORMA "," ",30)
replace standard_name = subinstr(standard_name,"-PRO FORMA1 "," ",30)
replace standard_name = subinstr(standard_name,"-PRO FORMA2 "," ",30)
replace standard_name = subinstr(standard_name,"-PRO FORMA3 "," ",30)
replace standard_name = subinstr(standard_name,"-REDH "," ",30)
replace standard_name = subinstr(standard_name,"-SER A COM "," ",30)
replace standard_name = subinstr(standard_name,"-SER A "," ",30)
replace standard_name = subinstr(standard_name,"-SPN "," ",30)
replace standard_name = subinstr(standard_name," ACCPTNCE "," ACCEPTANCE ",30)
replace standard_name = subinstr(standard_name," BANCORPORATION "," BANCORP ",30)
replace standard_name = subinstr(standard_name," BANCORPORTN "," BANCORP ",30)
replace standard_name = subinstr(standard_name," BANCRP "," BANCORP ",30)
replace standard_name = subinstr(standard_name," BNCSHRS "," BANCSHARES ",30)
replace standard_name = subinstr(standard_name," BRWG "," BREWING ",30)
replace standard_name = subinstr(standard_name," CHEVRONTEXACO "," CHEVRON TEXACO ",30)
replace standard_name = subinstr(standard_name," CHSE "," CHASE ",30)
replace standard_name = subinstr(standard_name," COMMN "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," COMMUN "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," COMMUNICATNS "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," COMMUNICATIONS "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," DPT STS "," DEPT STORES ",30)
replace standard_name = subinstr(standard_name," DPT "," DEPT ",30)
replace standard_name = subinstr(standard_name," ENRGY "," ENERGY ",30)
replace standard_name = subinstr(standard_name," FINL "," FINANCIAL ",30)
replace standard_name = subinstr(standard_name," FNCL "," FINANCIAL ",30)
replace standard_name = subinstr(standard_name," GRP "," GROUP ",30)
replace standard_name = subinstr(standard_name," HLDGS "," HOLDINGS ",30)  
replace standard_name = subinstr(standard_name," HLDG "," HOLDING ",30)
replace standard_name = subinstr(standard_name," HLT NTWK "," HEALTH NETWORK ",30)
replace standard_name = subinstr(standard_name," HTLS RES "," HOTELS & RESORTS ",30)
replace standard_name = subinstr(standard_name," HLTH "," HEALTH ",30)
replace standard_name = subinstr(standard_name," INTRTECHNLGY "," INTERTECHNOLOGY ",30)
replace standard_name = subinstr(standard_name," JPMORGAN "," J P MORGAN ",30)
replace standard_name = subinstr(standard_name," MED OPTIC "," MEDICAL OPTICS ",30)
replace standard_name = subinstr(standard_name," MINNESOTA MINING AND MANUFACTURING COMPANY "," 3M COMPANY ",30)
replace standard_name = subinstr(standard_name," NAT RES "," NATURAL RESOURCES ",30)
replace standard_name = subinstr(standard_name," NETWRKS "," NETWORK ",30)
replace standard_name = subinstr(standard_name," PHARMACTICALS "," PHARM ",30)
replace standard_name = subinstr(standard_name," PHARMACT "," PHARM ",30)
replace standard_name = subinstr(standard_name," PPTYS TST "," PROPERTIES TRUST ",30)
replace standard_name = subinstr(standard_name," PPTY "," PROPERTY ",30)
replace standard_name = subinstr(standard_name," PROPERTY TR "," PROPERTY TRUST ",30)
replace standard_name = subinstr(standard_name," PAC RAILWY "," PACIFIC RAILWAY ",30)
replace standard_name = subinstr(standard_name," SEMICONDTR "," SEMICONDUCTOR ",30)
replace standard_name = subinstr(standard_name," SOLU "," SOLUTIONS ",30)
replace standard_name = subinstr(standard_name," ST & ALMN "," STEEL & ALUMINUM ",30)
replace standard_name = subinstr(standard_name," STD "," STANDARD ",30)
replace standard_name = subinstr(standard_name," TECHNOLGS "," TECH ",30)
replace standard_name = subinstr(standard_name," TECHNOL "," TECH ",30)
replace standard_name = subinstr(standard_name," TRANSPORTATN "," TRANSPORTATION ",30)
replace standard_name = subinstr(standard_name," ADVERTSG "," ADVERTISING ",30)  /* JB */
replace standard_name = subinstr(standard_name," ADVNTGE "," ADVANTAGE ",30)     /* JB */
replace standard_name = subinstr(standard_name," AIRLN "," AIRLINES ",30)    /* JB */
replace standard_name = subinstr(standard_name," AIRLS "," AIRLINES ",30)    /* JB */
replace standard_name = subinstr(standard_name," AM "," AMERICA ",30)    /* JB */
replace standard_name = subinstr(standard_name," AMER "," AMERICAN ",30)     /* JB */
replace standard_name = subinstr(standard_name," APPLIAN "," APPLIANCES ",30)    /* JB */
replace standard_name = subinstr(standard_name," APPLICTN "," APPLICATIONS ",30)     /* JB */
replace standard_name = subinstr(standard_name," ARCHTCTS "," ARCHITECTS ",30)   /* JB */
replace standard_name = subinstr(standard_name," ASSD "," ASSOCIATED ",30)   /* JB */
replace standard_name = subinstr(standard_name," ASSOC "," ASSOCIATES ",30)  /* JB */
replace standard_name = subinstr(standard_name," ASSOCS "," ASSOCIATES ",30)     /* JB */
replace standard_name = subinstr(standard_name," ATOMC "," ATOMIC ",30)  /* JB */
replace standard_name = subinstr(standard_name," BANCSH "," BANCSHARES ",30)     /* JB */
replace standard_name = subinstr(standard_name," BANCSHR "," BANCSHARES ",30)    /* JB */
replace standard_name = subinstr(standard_name," BCSHS "," BANCSHARES ",30)  /* JB */
replace standard_name = subinstr(standard_name," BK "," BANK ",30)   /* JB */
replace standard_name = subinstr(standard_name," BLDGS "," BUILDINGS ",30)   /* JB */
replace standard_name = subinstr(standard_name," BROADCASTG "," BROADCASTING ",30)   /* JB */
replace standard_name = subinstr(standard_name," BTLNG "," BOTTLING ",30)    /* JB */
replace standard_name = subinstr(standard_name," CBLVISION "," CABLEVISION ",30)     /* JB */
replace standard_name = subinstr(standard_name," CENTRS "," CENTERS ",30)    /* JB */
replace standard_name = subinstr(standard_name," CHAMPNSHIP "," CHAMPIONSHIP ",30)   /* JB */
replace standard_name = subinstr(standard_name," CMMNCTNS "," COMMUNICATIONS ",30)   /* JB */
replace standard_name = subinstr(standard_name," CNVRSION "," CONVERSION ",30)   /* JB */
replace standard_name = subinstr(standard_name," COFF "," COFFEE ",30)   /* JB */
replace standard_name = subinstr(standard_name," COMM "," COMMUNICATIONS ",30)   /* JB */
replace standard_name = subinstr(standard_name," COMMUN "," COMMUNICATIONS ",30)     /* JB */
replace standard_name = subinstr(standard_name," COMMUNCTN "," COMMUNICATIONS ",30)  /* JB */
replace standard_name = subinstr(standard_name," COMMUNICTNS "," COMMUNICATIONS ",30)    /* JB */
replace standard_name = subinstr(standard_name," COMP "," COMPUTERS ",30)    /* JB */
replace standard_name = subinstr(standard_name," COMPUTR "," COMPUTER ",30)  /* JB */
replace standard_name = subinstr(standard_name," CONFERENCG "," CONFERENCING ",30)   /* JB */
replace standard_name = subinstr(standard_name," CONSTRN "," CONSTR ",30)    /* JB */
replace standard_name = subinstr(standard_name," CONTL "," CONTINENTAL ",30)     /* JB */
replace standard_name = subinstr(standard_name," CONTNT "," CONTINENTAL ",30)    /* JB */
replace standard_name = subinstr(standard_name," CONTRL "," CONTROL ",30)    /* JB */
replace standard_name = subinstr(standard_name," CONTRL "," CONTROL ",30)    /* JB */
replace standard_name = subinstr(standard_name," CTR "," CENTER ",30)    /* JB */
replace standard_name = subinstr(standard_name," CTRS "," CENTERS ",30)  /* JB */
replace standard_name = subinstr(standard_name," CVRGS "," COVERINGS ",30)   /* JB */
replace standard_name = subinstr(standard_name," DEV "," DEVELOPMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," DEVL "," DEVELOPMENT ",30)  /* JB */
replace standard_name = subinstr(standard_name," DEVLP "," DEVELOPMENT ",30)     /* JB */
replace standard_name = subinstr(standard_name," DISTR "," DISTRIBUTION ",30)    /* JB */
replace standard_name = subinstr(standard_name," DISTRIBUT "," DISTRIBUTION ",30)    /* JB */
replace standard_name = subinstr(standard_name," DISTRIBUTN "," DISTRIBUTION ",30)   /* JB */
replace standard_name = subinstr(standard_name," ELCTRNCS "," ELECTRONICS ",30)  /* JB */
replace standard_name = subinstr(standard_name," ELECTR "," ELECTRONICS ",30)    /* JB */
replace standard_name = subinstr(standard_name," ENGNRD "," ENGINEERED ",30)     /* JB */
replace standard_name = subinstr(standard_name," ENMT "," ENTERTAINMENT ",30)    /* JB */
replace standard_name = subinstr(standard_name," ENTERTAIN "," ENTERTAINMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," ENTERTNMNT "," ENTERTAINMENT ",30)  /* JB */
replace standard_name = subinstr(standard_name," ENTMNT "," ENTERTAINMENT ",30)  /* JB */
replace standard_name = subinstr(standard_name," ENTMT "," ENTERTAINMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," ENTRPR "," ENTERPRISES ",30)    /* JB */
replace standard_name = subinstr(standard_name," ENTRPRISE "," ENTERPRISES ",30)     /* JB */
replace standard_name = subinstr(standard_name," ENTRPRS "," ENTERPRISES ",30)   /* JB */
replace standard_name = subinstr(standard_name," ENVIR "," ENVIRONMENTAL ",30)   /* JB */
replace standard_name = subinstr(standard_name," ENVIRNMNTL "," ENVIRONMENTAL ",30)  /* JB */
replace standard_name = subinstr(standard_name," ENVR "," ENVIRONMENTAL ",30)    /* JB */
replace standard_name = subinstr(standard_name," EQUIPMT "," EQUIPMENT ",30)     /* JB */
replace standard_name = subinstr(standard_name," EXCHG "," EXCHANGE ",30)    /* JB */
replace standard_name = subinstr(standard_name," EXPLOR "," EXPLORATION ",30)    /* JB */
replace standard_name = subinstr(standard_name," FNDG "," FUNDING ",30)  /* JB */
replace standard_name = subinstr(standard_name," GLD "," GOLD ",30)  /* JB */
replace standard_name = subinstr(standard_name," GP "," GROUP ",30)  /* JB */
replace standard_name = subinstr(standard_name," HLDS "," HLDGS ",30)    /* JB */
replace standard_name = subinstr(standard_name," HLTHCARE "," HEALTHCARE ",30)   /* JB */
replace standard_name = subinstr(standard_name," HLTHCR "," HEALTHCARE ",30)     /* JB */
replace standard_name = subinstr(standard_name," HOMEMDE "," HOMEMADE ",30)  /* JB */
replace standard_name = subinstr(standard_name," HSPTL "," HOSPITAL ",30)    /* JB */
replace standard_name = subinstr(standard_name," ILLUM "," ILLUMINATION ",30)    /* JB */
replace standard_name = subinstr(standard_name," INDL "," INDUSTRIAL ",30)   /* JB */
replace standard_name = subinstr(standard_name," INDPT "," INDEPENDENT ",30)     /* JB */
replace standard_name = subinstr(standard_name," INDTY "," INDEMNITY ",30)   /* JB */
replace standard_name = subinstr(standard_name," INFORMATN "," INFO ",30)    /* JB */
replace standard_name = subinstr(standard_name," INSTNS "," INSTITUTIONS ",30)   /* JB */
replace standard_name = subinstr(standard_name," INSTRUMEN "," INSTRUMENTS ",30)     /* JB */
replace standard_name = subinstr(standard_name," INSTRUMNT "," INSTRUMENTS ",30)     /* JB */
replace standard_name = subinstr(standard_name," INTEGRATRS "," INTEGRATORS ",30)    /* JB */
replace standard_name = subinstr(standard_name," INTERNATIONL "," INT ",30)  /* JB */
replace standard_name = subinstr(standard_name," INVS "," INVESTMENTS ",30)  /* JB */
replace standard_name = subinstr(standard_name," INVT "," INVESTMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," MANAGEMNT "," MANAGEMENT ",30)  /* JB */
replace standard_name = subinstr(standard_name," MANAGMNT "," MANAGEMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," MANHATN "," MANHATTAN ",30)     /* JB */
replace standard_name = subinstr(standard_name," MANUF "," MFG ",30)     /* JB */
replace standard_name = subinstr(standard_name," MDSE "," MERCHANDISING ",30)    /* JB */
replace standard_name = subinstr(standard_name," MEASURMNT "," MEASUREMENT ",30)     /* JB */
replace standard_name = subinstr(standard_name," MERCHNDSNG "," MERCHANDISING ",30)  /* JB */
replace standard_name = subinstr(standard_name," MGMT "," MANAGEMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," MGRS "," MANAGERS ",30)     /* JB */
replace standard_name = subinstr(standard_name," MGT "," MANAGEMENT ",30)    /* JB */
replace standard_name = subinstr(standard_name," MICROWAV "," MICROWAVE ",30)    /* JB */
replace standard_name = subinstr(standard_name," MKTS "," MARKETS ",30)  /* JB */
replace standard_name = subinstr(standard_name," MLTIMEDIA "," MULTIMEDIA ",30)  /* JB */
replace standard_name = subinstr(standard_name," MTG "," MORTGAGE ",30)  /* JB */
replace standard_name = subinstr(standard_name," MTNS "," MOUTAINS ",30)     /* JB */
replace standard_name = subinstr(standard_name," MTRS "," MOTORS ",30)   /* JB */
replace standard_name = subinstr(standard_name," NETWRK "," NETWORK ",30)    /* JB */
replace standard_name = subinstr(standard_name," NOWEST "," NORTHWEST ",30)  /* JB */
replace standard_name = subinstr(standard_name," NTWRK "," NETWORK ",30)     /* JB */
replace standard_name = subinstr(standard_name," OFFSHRE "," OFFSHORE ",30)  /* JB */
replace standard_name = subinstr(standard_name," ORGANIZTN "," ORG ",30)     /* JB */
replace standard_name = subinstr(standard_name," PBLG "," PUBLISHING ",30)   /* JB */
replace standard_name = subinstr(standard_name," PHARMACEUTICL "," PHARM ",30)   /* JB */
replace standard_name = subinstr(standard_name," PLAST "," PLASTICS ",30)    /* JB */
replace standard_name = subinstr(standard_name," PPTYS "," PROPERTIES ",30)  /* JB */
replace standard_name = subinstr(standard_name," PRODS "," PROD ",30)    /* JB */
replace standard_name = subinstr(standard_name," PRODTN "," PRODN ",30)  /* JB */
replace standard_name = subinstr(standard_name," PRODUCTN "," PRODN ",30)    /* JB */
replace standard_name = subinstr(standard_name," PRPANE "," PROPANE ",30)    /* JB */
replace standard_name = subinstr(standard_name," PTS "," PARTS ",30)     /* JB */
replace standard_name = subinstr(standard_name," PUBLISH "," PUBLISHING ",30)    /* JB */
replace standard_name = subinstr(standard_name," PUBLSHING "," PUBLISHING ",30)  /* JB */
replace standard_name = subinstr(standard_name," PUBN "," PUBLICATIONS ",30)     /* JB */
replace standard_name = subinstr(standard_name," PUBNS "," PUBLICATIONS ",30)    /* JB */
replace standard_name = subinstr(standard_name," PWR "," POWER ",30)     /* JB */
replace standard_name = subinstr(standard_name," RAILRD "," RAILROAD ",30)   /* JB */
replace standard_name = subinstr(standard_name," RECREATN "," RECREATION ",30)   /* JB */
replace standard_name = subinstr(standard_name," RECYCL "," RECYCLING ",30)  /* JB */
replace standard_name = subinstr(standard_name," REFIN "," REFINING ",30)    /* JB */
replace standard_name = subinstr(standard_name," REFNG "," REFINING ",30)    /* JB */
replace standard_name = subinstr(standard_name," RESTR "," RESTAURANT ",30)  /* JB */
replace standard_name = subinstr(standard_name," RESTS "," RESTAURANTS ",30)     /* JB */
replace standard_name = subinstr(standard_name," RETAILNG "," RETAILING ",30)    /* JB */
replace standard_name = subinstr(standard_name," RLTY "," REALTY ",30)   /* JB */
replace standard_name = subinstr(standard_name," RR "," RAILROAD ",30)   /* JB */
replace standard_name = subinstr(standard_name," RSCH "," RESEARCH ",30)     /* JB */
replace standard_name = subinstr(standard_name," RTNG "," RATING ",30)   /* JB */
replace standard_name = subinstr(standard_name," SCIENTIF "," SCIENTIFIC ",30)   /* JB */
replace standard_name = subinstr(standard_name," SERV "," SERVICES ",30)     /* JB */
replace standard_name = subinstr(standard_name," SLTNS "," SOLUTIONS ",30)   /* JB */
replace standard_name = subinstr(standard_name," SOFTWRE "," SOFTWARE ",30)  /* JB */
replace standard_name = subinstr(standard_name," SOLTNS "," SOLUTIONS ",30)  /* JB */
replace standard_name = subinstr(standard_name," SOLUT "," SOLUTIONS ",30)   /* JB */
replace standard_name = subinstr(standard_name," SRVC "," SERVICES ",30)     /* JB */
replace standard_name = subinstr(standard_name," SRVCS "," SERVICES ",30)    /* JB */
replace standard_name = subinstr(standard_name," STEAKHSE "," STEAKHOUSE ",30)   /* JB */
replace standard_name = subinstr(standard_name," STHWST "," SOUTHWEST ",30)  /* JB */
replace standard_name = subinstr(standard_name," STL "," STEEL ",30)     /* JB */
replace standard_name = subinstr(standard_name," STRS "," STORES ",30)   /* JB */
replace standard_name = subinstr(standard_name," SUP "," SUPPLY ",30)    /* JB */
replace standard_name = subinstr(standard_name," SUPERMKTS "," SUPERMARKETS ",30)    /* JB */
replace standard_name = subinstr(standard_name," SUPP "," SUPPLIES ",30)     /* JB */
replace standard_name = subinstr(standard_name," SURVYS "," SURVEYS ",30)    /* JB */
replace standard_name = subinstr(standard_name," SVC "," SERVICES ",30)  /* JB */
replace standard_name = subinstr(standard_name," SVCS "," SERVICES ",30)     /* JB */
replace standard_name = subinstr(standard_name," SVSC "," SERVICES ",30)     /* JB */
replace standard_name = subinstr(standard_name," SYS "," SYSTEMS ",30)   /* JB */
replace standard_name = subinstr(standard_name," SYSTM "," SYSTEMS ",30)     /* JB */
replace standard_name = subinstr(standard_name," TCHNLGY "," TECH ",30)  /* JB */
replace standard_name = subinstr(standard_name," TECHNGS "," TECHNOLOGIES ",30)  /* JB */
replace standard_name = subinstr(standard_name," TECHNL "," TECH ",30)   /* JB */
replace standard_name = subinstr(standard_name," TECHNLGIES "," TECHNOLOGIES ",30)   /* JB */
replace standard_name = subinstr(standard_name," TEL "," TELEPHONE ",30)     /* JB */
replace standard_name = subinstr(standard_name," TELE-COMM "," TELECOMMUNICATIONS ",30)  /* JB */
replace standard_name = subinstr(standard_name," TELE-COMMUN "," TELECOMMUNICATIONS ",30)    /* JB */
replace standard_name = subinstr(standard_name," TELECOMMS "," TELECOMMUNICATIONS ",30)  /* JB */
replace standard_name = subinstr(standard_name," TELECONFERENC "," TELECONFERENCING ",30)    /* JB */
replace standard_name = subinstr(standard_name," TELEG "," TELEGRAPH ",30)   /* JB */
replace standard_name = subinstr(standard_name," TELEGR "," TELEGRAPH ",30)  /* JB */
replace standard_name = subinstr(standard_name," TELVSN "," TELEVISION ",30)     /* JB */
replace standard_name = subinstr(standard_name," TR "," TRUST ",30)  /* JB */
replace standard_name = subinstr(standard_name," TRANSN "," TRANSPORTATION ",30)     /* JB */
replace standard_name = subinstr(standard_name," TRANSPORTN "," TRANSPORTATION ",30)     /* JB */
replace standard_name = subinstr(standard_name," TRNSACTN "," TRANSACTION ",30)  /* JB */
replace standard_name = subinstr(standard_name," UTD "," UNITED ",30)    /* JB */
replace standard_name = subinstr(standard_name," WSTN "," WESTERN ",30)  /* JB */
replace standard_name = subinstr(standard_name," WTR "," WATER ",30)     /* JB */
replace standard_name=" U.S. PHILIPS CORPORATION " if trim(standard_name)=="NORTH AMERICAN PHILIPS CORP"
replace standard_name=" A. L. WILLIAMS CORP. " if trim(standard_name)=="WILLIAMS (A.L.) CORP"
replace standard_name=" B. F. GOODRICH CO. " if trim(standard_name)=="GOODRICH CORP"
replace standard_name=" BELL + HOWELL COMPANY " if trim(standard_name)=="BELL & HOWELL OPERATING CO"
replace standard_name=" BENDIX CORPORATION(NOW ALLIED-SIGNAL INC.) " if trim(standard_name)=="BENDIX CORP"
replace standard_name=" BORG-WARNER CORPORATION " if trim(standard_name)=="BORGWARNER INC"
replace standard_name=" CHRYSLER MOTORS CORPORATION " if trim(standard_name)=="CHRYSLER CORP"
replace standard_name=" CISCO TECHNOLOGY, INC. " if trim(standard_name)=="CISCO SYSTEMS INC"
replace standard_name=" DELL PRODUCTS, L.P. " if trim(standard_name)=="DELL INC"
replace standard_name=" DELPHI TECHNOLOGIES, INC. " if trim(standard_name)=="DELPHI CORP"
replace standard_name=" E. I. DU PONT DE NEMOURS AND COMPANY " if trim(standard_name)=="DU PONT (E I) DE NEMOURS"
replace standard_name=" E. R. SQUIBB + SONS, INC. " if trim(standard_name)=="SQUIBB CORP"
replace standard_name=" ELI LILLY AND COMPANY " if trim(standard_name)=="LILLY (ELI) & CO"
replace standard_name=" G. D. SEARLE & CO. " if trim(standard_name)=="SEARLE (G.D.) & CO"
replace standard_name=" MINNESOTA MINING AND MANUFACTURING COMPANY " if trim(standard_name)=="3M CO"
replace standard_name=" OWENS-CORNING FIBERGLAS CORPORATION " if trim(standard_name)=="OWENS CORNING"
replace standard_name=" SCHLUMBERGER TECHNOLOGY CORPORATION " if trim(standard_name)=="SCHLUMBERGER LTD"
replace standard_name=" SCI-MED LIFE SYSTEMS, INC. " if trim(standard_name)=="SICMED LIFE SYSTEMS"
replace standard_name=" TDK CORPORATION " if trim(standard_name)=="TDK CORP"
replace standard_name=" UNITED STATES SURGICAL CORPORATION " if trim(standard_name)=="U S SURGICAL CORP"
replace standard_name=" W. R. GRACE & CO. " if trim(standard_name)=="GRACE (W R) & CO"
replace standard_name=" WESTINGHOUSE ELECTRIC CORP. " if trim(standard_name)=="WESTINGHOUSE ELEC"

* EPO Espace specific character format problems
* For files downloaded from EPO Espace & appears as &amp; 
* Also recode all common words for "AND" to &
tempvar len
gen `len' =length(standard_name)

replace standard_name = subinstr( standard_name, "&AMP;", " & ", 5)
replace standard_name = subinstr( standard_name, "+", " & ", 5)
replace standard_name = subinstr( standard_name, " AND ", " & ", 5)
replace standard_name = subinstr( standard_name, " ET ", " & ", 5)
replace standard_name = subinstr( standard_name, " UND ", " & ", 5)
replace standard_name = subinstr( standard_name, "&", " & ", 30)     /* BHH - ensure that & is separate word */

** British - specific problem with names that end in (THE) and names that start with
** THE, so remove these
replace standard_name=substr(standard_name, 1, `len'-5) if substr(standard_name, -5, 5)=="(THE)"
replace standard_name=substr(standard_name, 5, .) if substr(standard_name, 1, 4)=="THE "

** strip punctuation 
replace standard_name = subinstr( standard_name, "'",  "", 30)
replace standard_name = subinstr( standard_name, ";",  "", 30) 
replace standard_name = subinstr( standard_name, "^",  "", 30)
replace standard_name = subinstr( standard_name, "<",  "", 30)
replace standard_name = subinstr( standard_name, ".",  "", 30)
replace standard_name = subinstr( standard_name, "`",  "", 30)
replace standard_name = subinstr( standard_name, "_",  "", 30)
replace standard_name = subinstr( standard_name, ">",  "", 30)
replace standard_name = subinstr( standard_name, "''", "", 30)
replace standard_name = subinstr( standard_name, "!",  "", 30)
replace standard_name = subinstr( standard_name, "+",  "", 30)
replace standard_name = subinstr( standard_name, "?",  "", 30)
replace standard_name = subinstr( standard_name, "(",  "", 30)
replace standard_name = subinstr( standard_name, "£",  "", 30)
replace standard_name = subinstr( standard_name, "{",  "", 30)
replace standard_name = subinstr( standard_name, "\",  "", 30)
replace standard_name = subinstr( standard_name, ")",  "", 30)
replace standard_name = subinstr( standard_name, "$",  "", 30)
replace standard_name = subinstr( standard_name, "}",  "", 30)
replace standard_name = subinstr( standard_name, "|",  "", 30)
replace standard_name = subinstr( standard_name, ",",  "", 30)
replace standard_name = subinstr( standard_name, "%",  "", 30)
replace standard_name = subinstr( standard_name, "[",  "", 30)
replace standard_name = subinstr( standard_name, "¦",  "", 30)
replace standard_name = subinstr( standard_name, "*",  "", 30)
replace standard_name = subinstr( standard_name, "]",  "", 30)
replace standard_name = subinstr( standard_name, "/",  " ", 30) 
replace standard_name = subinstr( standard_name, "@",  "", 30)
replace standard_name = subinstr( standard_name, ":",  "", 30)
replace standard_name = subinstr( standard_name, "~",  "", 30)
replace standard_name = subinstr( standard_name, "#",  "", 30)
replace standard_name = subinstr( standard_name, "-",  " ", 30) 
replace standard_name = subinstr( standard_name, "  ", " ", 30) 


* Clean common corporate type identifiers 
* 2) Perform some additional changes
replace standard_name = subinstr( standard_name, " RES & DEV ", " R&D ", 1)
replace standard_name = subinstr( standard_name, " RECH & DEV ", " R&D ", 1)

* 3) Perform some country specific work
* UNITED STATES (most of this is in Derwent)

* UNITED KINGDOM
replace standard_name = subinstr( standard_name, " PUBLIC LIMITED ", " PLC ", 1)
replace standard_name = subinstr( standard_name, " PUBLIC LIABILITY COMPANY ", " PLC ", 1)
replace standard_name = subinstr( standard_name, " HOLDINGS ", " HLDGS ", 1)
replace standard_name = subinstr( standard_name, " HOLDING ", " HLDGS ", 1)
replace standard_name = subinstr( standard_name, " GREAT BRITAIN ", " GB ", 1)
replace standard_name = subinstr( standard_name, " LTD CO ", " CO LTD ", 1)

* SPANISH
replace standard_name = subinstr( standard_name, " SOC LIMITADA ", " SL ", 1)
replace standard_name = subinstr( standard_name, " SOC EN COMMANDITA ", " SC ", 1)
replace standard_name = subinstr( standard_name, " & CIA ", " CO ", 1)

rename standard_name company
label var company "Requestor Establishment (Cleaned, for matching)"
save "$output_data/standardized_surveynames.dta", replace 

* Export a simplified mapping to merge back onto the survey 
statastates, name(state)

keep estab_name company state_abbrev 
drop if missing(estab_name)

duplicates drop 
save "$output_data/standard_name_cw.dta", replace 

* Standardize names of infogroup companies
local chunk_size 150000000        // how many lines to read in at once? 
local this_chunk `chunk_size'

tempfile main 
clear 

local iter 0
while (`this_chunk' + 1 >= `chunk_size') {
import delimited "$input_data/2019_Business_Academic_QCQ.txt", /// 
clear varnames(1) rowrange(`=(`iter' * `chunk_size') + 1':`=(`iter++' + 1) * `chunk_size'')

qui: count    
local this_chunk = `r(N)'

gen raw_company = company
destring longitude, replace force
destring latitude, replace force

drop if missing(company) | missing(latitude) | missing(longitude)

** Standardize names with NBER scripts 
gen standard_name = " "+trim(company)+" "
replace standard_name=upper(standard_name)

replace standard_name = subinstr(standard_name,"-ADR"," ",30)
replace standard_name = subinstr(standard_name,"-ADS"," ",30)
replace standard_name = subinstr(standard_name,"-CL A "," ",30)
replace standard_name = subinstr(standard_name,"-CL B "," ",30)
replace standard_name = subinstr(standard_name,"-CONN "," ",30)
replace standard_name = subinstr(standard_name,"-CONSOLIDATED "," ",30)
replace standard_name = subinstr(standard_name,"-DEL "," ",30)
replace standard_name = subinstr(standard_name,"-DE "," ",30)
replace standard_name = subinstr(standard_name,"-NY SHARES "," ",30)
replace standard_name = subinstr(standard_name,"-OLD "," ",30)
replace standard_name = subinstr(standard_name,"-ORD "," ",30)
replace standard_name = subinstr(standard_name,"-PRE AMEND "," ",30)      /* JB */
replace standard_name = subinstr(standard_name,"-PRE DIVEST "," ",30)     /* JB */
replace standard_name = subinstr(standard_name,"-PREAMEND "," ",30)       /* JB */
replace standard_name = subinstr(standard_name,"-PREDIVEST "," ",30)      /* JB */
replace standard_name = subinstr(standard_name,"-PROJ "," ",30)       /* JB */
replace standard_name = subinstr(standard_name,"-PROJECTED "," ",30)      /* JB */
replace standard_name = subinstr(standard_name,"-PREF "," ",30)       /* JB */
replace standard_name = subinstr(standard_name,"-PRE FASB "," ",30)       /* JB */
replace standard_name = subinstr(standard_name,"-PREFASB "," ",30)        /* JB */
replace standard_name = subinstr(standard_name,"-PRO FORMA "," ",30)
replace standard_name = subinstr(standard_name,"- PRO FORMA "," ",30)
replace standard_name = subinstr(standard_name,"-PRO FORMA1 "," ",30)
replace standard_name = subinstr(standard_name,"-PRO FORMA2 "," ",30)
replace standard_name = subinstr(standard_name,"-PRO FORMA3 "," ",30)
replace standard_name = subinstr(standard_name,"-REDH "," ",30)
replace standard_name = subinstr(standard_name,"-SER A COM "," ",30)
replace standard_name = subinstr(standard_name,"-SER A "," ",30)
replace standard_name = subinstr(standard_name,"-SPN "," ",30)

replace standard_name = subinstr(standard_name," ACCPTNCE "," ACCEPTANCE ",30)
replace standard_name = subinstr(standard_name," BANCORPORATION "," BANCORP ",30)
replace standard_name = subinstr(standard_name," BANCORPORTN "," BANCORP ",30)
replace standard_name = subinstr(standard_name," BANCRP "," BANCORP ",30)
replace standard_name = subinstr(standard_name," BNCSHRS "," BANCSHARES ",30)
replace standard_name = subinstr(standard_name," BRWG "," BREWING ",30)
replace standard_name = subinstr(standard_name," CHEVRONTEXACO "," CHEVRON TEXACO ",30)
replace standard_name = subinstr(standard_name," CHSE "," CHASE ",30)
replace standard_name = subinstr(standard_name," COMMN "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," COMMUN "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," COMMUNICATNS "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," COMMUNICATIONS "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," DPT STS "," DEPT STORES ",30)
replace standard_name = subinstr(standard_name," DPT "," DEPT ",30)
replace standard_name = subinstr(standard_name," ENRGY "," ENERGY ",30)
replace standard_name = subinstr(standard_name," FINL "," FINANCIAL ",30)
replace standard_name = subinstr(standard_name," FNCL "," FINANCIAL ",30)
replace standard_name = subinstr(standard_name," GRP "," GROUP ",30)
replace standard_name = subinstr(standard_name," HLDGS "," HOLDINGS ",30)  
replace standard_name = subinstr(standard_name," HLDG "," HOLDING ",30)
replace standard_name = subinstr(standard_name," HLT NTWK "," HEALTH NETWORK ",30)
replace standard_name = subinstr(standard_name," HTLS RES "," HOTELS & RESORTS ",30)
replace standard_name = subinstr(standard_name," HLTH "," HEALTH ",30)
replace standard_name = subinstr(standard_name," INTRTECHNLGY "," INTERTECHNOLOGY ",30)
replace standard_name = subinstr(standard_name," JPMORGAN "," J P MORGAN ",30)
replace standard_name = subinstr(standard_name," MED OPTIC "," MEDICAL OPTICS ",30)
replace standard_name = subinstr(standard_name," MINNESOTA MINING AND MANUFACTURING COMPANY "," 3M COMPANY ",30)
replace standard_name = subinstr(standard_name," NAT RES "," NATURAL RESOURCES ",30)
replace standard_name = subinstr(standard_name," NETWRKS "," NETWORK ",30)
replace standard_name = subinstr(standard_name," PHARMACTICALS "," PHARM ",30)
replace standard_name = subinstr(standard_name," PHARMACT "," PHARM ",30)
replace standard_name = subinstr(standard_name," PPTYS TST "," PROPERTIES TRUST ",30)
replace standard_name = subinstr(standard_name," PPTY "," PROPERTY ",30)
replace standard_name = subinstr(standard_name," PROPERTY TR "," PROPERTY TRUST ",30)
replace standard_name = subinstr(standard_name," PAC RAILWY "," PACIFIC RAILWAY ",30)
replace standard_name = subinstr(standard_name," SEMICONDTR "," SEMICONDUCTOR ",30)
replace standard_name = subinstr(standard_name," SOLU "," SOLUTIONS ",30)
replace standard_name = subinstr(standard_name," ST & ALMN "," STEEL & ALUMINUM ",30)
replace standard_name = subinstr(standard_name," STD "," STANDARD ",30)
replace standard_name = subinstr(standard_name," TECHNOLGS "," TECH ",30)
replace standard_name = subinstr(standard_name," TECHNOL "," TECH ",30)
replace standard_name = subinstr(standard_name," TRANSPORTATN "," TRANSPORTATION ",30)
replace standard_name = subinstr(standard_name," ADVERTSG "," ADVERTISING ",30)  /* JB */
replace standard_name = subinstr(standard_name," ADVNTGE "," ADVANTAGE ",30)     /* JB */
replace standard_name = subinstr(standard_name," AIRLN "," AIRLINES ",30)    /* JB */
replace standard_name = subinstr(standard_name," AIRLS "," AIRLINES ",30)    /* JB */
replace standard_name = subinstr(standard_name," AM "," AMERICA ",30)    /* JB */
replace standard_name = subinstr(standard_name," AMER "," AMERICAN ",30)     /* JB */
replace standard_name = subinstr(standard_name," APPLIAN "," APPLIANCES ",30)    /* JB */
replace standard_name = subinstr(standard_name," APPLICTN "," APPLICATIONS ",30)     /* JB */
replace standard_name = subinstr(standard_name," ARCHTCTS "," ARCHITECTS ",30)   /* JB */
replace standard_name = subinstr(standard_name," ASSD "," ASSOCIATED ",30)   /* JB */
replace standard_name = subinstr(standard_name," ASSOC "," ASSOCIATES ",30)  /* JB */
replace standard_name = subinstr(standard_name," ASSOCS "," ASSOCIATES ",30)     /* JB */
replace standard_name = subinstr(standard_name," ATOMC "," ATOMIC ",30)  /* JB */
replace standard_name = subinstr(standard_name," BANCSH "," BANCSHARES ",30)     /* JB */
replace standard_name = subinstr(standard_name," BANCSHR "," BANCSHARES ",30)    /* JB */
replace standard_name = subinstr(standard_name," BCSHS "," BANCSHARES ",30)  /* JB */
replace standard_name = subinstr(standard_name," BK "," BANK ",30)   /* JB */
replace standard_name = subinstr(standard_name," BLDGS "," BUILDINGS ",30)   /* JB */
replace standard_name = subinstr(standard_name," BROADCASTG "," BROADCASTING ",30)   /* JB */
replace standard_name = subinstr(standard_name," BTLNG "," BOTTLING ",30)    /* JB */
replace standard_name = subinstr(standard_name," CBLVISION "," CABLEVISION ",30)     /* JB */
replace standard_name = subinstr(standard_name," CENTRS "," CENTERS ",30)    /* JB */
replace standard_name = subinstr(standard_name," CHAMPNSHIP "," CHAMPIONSHIP ",30)   /* JB */
replace standard_name = subinstr(standard_name," CMMNCTNS "," COMMUNICATIONS ",30)   /* JB */
replace standard_name = subinstr(standard_name," CNVRSION "," CONVERSION ",30)   /* JB */
replace standard_name = subinstr(standard_name," COFF "," COFFEE ",30)   /* JB */
replace standard_name = subinstr(standard_name," COMM "," COMMUNICATIONS ",30)   /* JB */
replace standard_name = subinstr(standard_name," COMMUN "," COMMUNICATIONS ",30)     /* JB */
replace standard_name = subinstr(standard_name," COMMUNCTN "," COMMUNICATIONS ",30)  /* JB */
replace standard_name = subinstr(standard_name," COMMUNICTNS "," COMMUNICATIONS ",30)    /* JB */
replace standard_name = subinstr(standard_name," COMP "," COMPUTERS ",30)    /* JB */
replace standard_name = subinstr(standard_name," COMPUTR "," COMPUTER ",30)  /* JB */
replace standard_name = subinstr(standard_name," CONFERENCG "," CONFERENCING ",30)   /* JB */
replace standard_name = subinstr(standard_name," CONSTRN "," CONSTR ",30)    /* JB */
replace standard_name = subinstr(standard_name," CONTL "," CONTINENTAL ",30)     /* JB */
replace standard_name = subinstr(standard_name," CONTNT "," CONTINENTAL ",30)    /* JB */
replace standard_name = subinstr(standard_name," CONTRL "," CONTROL ",30)    /* JB */
replace standard_name = subinstr(standard_name," CONTRL "," CONTROL ",30)    /* JB */
replace standard_name = subinstr(standard_name," CTR "," CENTER ",30)    /* JB */
replace standard_name = subinstr(standard_name," CTRS "," CENTERS ",30)  /* JB */
replace standard_name = subinstr(standard_name," CVRGS "," COVERINGS ",30)   /* JB */
replace standard_name = subinstr(standard_name," DEV "," DEVELOPMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," DEVL "," DEVELOPMENT ",30)  /* JB */
replace standard_name = subinstr(standard_name," DEVLP "," DEVELOPMENT ",30)     /* JB */
replace standard_name = subinstr(standard_name," DISTR "," DISTRIBUTION ",30)    /* JB */
replace standard_name = subinstr(standard_name," DISTRIBUT "," DISTRIBUTION ",30)    /* JB */
replace standard_name = subinstr(standard_name," DISTRIBUTN "," DISTRIBUTION ",30)   /* JB */
replace standard_name = subinstr(standard_name," ELCTRNCS "," ELECTRONICS ",30)  /* JB */
replace standard_name = subinstr(standard_name," ELECTR "," ELECTRONICS ",30)    /* JB */
replace standard_name = subinstr(standard_name," ENGNRD "," ENGINEERED ",30)     /* JB */
replace standard_name = subinstr(standard_name," ENMT "," ENTERTAINMENT ",30)    /* JB */
replace standard_name = subinstr(standard_name," ENTERTAIN "," ENTERTAINMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," ENTERTNMNT "," ENTERTAINMENT ",30)  /* JB */
replace standard_name = subinstr(standard_name," ENTMNT "," ENTERTAINMENT ",30)  /* JB */
replace standard_name = subinstr(standard_name," ENTMT "," ENTERTAINMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," ENTRPR "," ENTERPRISES ",30)    /* JB */
replace standard_name = subinstr(standard_name," ENTRPRISE "," ENTERPRISES ",30)     /* JB */
replace standard_name = subinstr(standard_name," ENTRPRS "," ENTERPRISES ",30)   /* JB */
replace standard_name = subinstr(standard_name," ENVIR "," ENVIRONMENTAL ",30)   /* JB */
replace standard_name = subinstr(standard_name," ENVIRNMNTL "," ENVIRONMENTAL ",30)  /* JB */
replace standard_name = subinstr(standard_name," ENVR "," ENVIRONMENTAL ",30)    /* JB */
replace standard_name = subinstr(standard_name," EQUIPMT "," EQUIPMENT ",30)     /* JB */
replace standard_name = subinstr(standard_name," EXCHG "," EXCHANGE ",30)    /* JB */
replace standard_name = subinstr(standard_name," EXPLOR "," EXPLORATION ",30)    /* JB */
replace standard_name = subinstr(standard_name," FNDG "," FUNDING ",30)  /* JB */
replace standard_name = subinstr(standard_name," GLD "," GOLD ",30)  /* JB */
replace standard_name = subinstr(standard_name," GP "," GROUP ",30)  /* JB */
replace standard_name = subinstr(standard_name," HLDS "," HLDGS ",30)    /* JB */
replace standard_name = subinstr(standard_name," HLTHCARE "," HEALTHCARE ",30)   /* JB */
replace standard_name = subinstr(standard_name," HLTHCR "," HEALTHCARE ",30)     /* JB */
replace standard_name = subinstr(standard_name," HOMEMDE "," HOMEMADE ",30)  /* JB */
replace standard_name = subinstr(standard_name," HSPTL "," HOSPITAL ",30)    /* JB */
replace standard_name = subinstr(standard_name," ILLUM "," ILLUMINATION ",30)    /* JB */
replace standard_name = subinstr(standard_name," INDL "," INDUSTRIAL ",30)   /* JB */
replace standard_name = subinstr(standard_name," INDPT "," INDEPENDENT ",30)     /* JB */
replace standard_name = subinstr(standard_name," INDTY "," INDEMNITY ",30)   /* JB */
replace standard_name = subinstr(standard_name," INFORMATN "," INFO ",30)    /* JB */
replace standard_name = subinstr(standard_name," INSTNS "," INSTITUTIONS ",30)   /* JB */
replace standard_name = subinstr(standard_name," INSTRUMEN "," INSTRUMENTS ",30)     /* JB */
replace standard_name = subinstr(standard_name," INSTRUMNT "," INSTRUMENTS ",30)     /* JB */
replace standard_name = subinstr(standard_name," INTEGRATRS "," INTEGRATORS ",30)    /* JB */
replace standard_name = subinstr(standard_name," INTERNATIONL "," INT ",30)  /* JB */
replace standard_name = subinstr(standard_name," INVS "," INVESTMENTS ",30)  /* JB */
replace standard_name = subinstr(standard_name," INVT "," INVESTMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," MANAGEMNT "," MANAGEMENT ",30)  /* JB */
replace standard_name = subinstr(standard_name," MANAGMNT "," MANAGEMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," MANHATN "," MANHATTAN ",30)     /* JB */
replace standard_name = subinstr(standard_name," MANUF "," MFG ",30)     /* JB */
replace standard_name = subinstr(standard_name," MDSE "," MERCHANDISING ",30)    /* JB */
replace standard_name = subinstr(standard_name," MEASURMNT "," MEASUREMENT ",30)     /* JB */
replace standard_name = subinstr(standard_name," MERCHNDSNG "," MERCHANDISING ",30)  /* JB */
replace standard_name = subinstr(standard_name," MGMT "," MANAGEMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," MGRS "," MANAGERS ",30)     /* JB */
replace standard_name = subinstr(standard_name," MGT "," MANAGEMENT ",30)    /* JB */
replace standard_name = subinstr(standard_name," MICROWAV "," MICROWAVE ",30)    /* JB */
replace standard_name = subinstr(standard_name," MKTS "," MARKETS ",30)  /* JB */
replace standard_name = subinstr(standard_name," MLTIMEDIA "," MULTIMEDIA ",30)  /* JB */
replace standard_name = subinstr(standard_name," MTG "," MORTGAGE ",30)  /* JB */
replace standard_name = subinstr(standard_name," MTNS "," MOUTAINS ",30)     /* JB */
replace standard_name = subinstr(standard_name," MTRS "," MOTORS ",30)   /* JB */
replace standard_name = subinstr(standard_name," NETWRK "," NETWORK ",30)    /* JB */
replace standard_name = subinstr(standard_name," NOWEST "," NORTHWEST ",30)  /* JB */
replace standard_name = subinstr(standard_name," NTWRK "," NETWORK ",30)     /* JB */
replace standard_name = subinstr(standard_name," OFFSHRE "," OFFSHORE ",30)  /* JB */
replace standard_name = subinstr(standard_name," ORGANIZTN "," ORG ",30)     /* JB */
replace standard_name = subinstr(standard_name," PBLG "," PUBLISHING ",30)   /* JB */
replace standard_name = subinstr(standard_name," PHARMACEUTICL "," PHARM ",30)   /* JB */
replace standard_name = subinstr(standard_name," PLAST "," PLASTICS ",30)    /* JB */
replace standard_name = subinstr(standard_name," PPTYS "," PROPERTIES ",30)  /* JB */
replace standard_name = subinstr(standard_name," PRODS "," PROD ",30)    /* JB */
replace standard_name = subinstr(standard_name," PRODTN "," PRODN ",30)  /* JB */
replace standard_name = subinstr(standard_name," PRODUCTN "," PRODN ",30)    /* JB */
replace standard_name = subinstr(standard_name," PRPANE "," PROPANE ",30)    /* JB */
replace standard_name = subinstr(standard_name," PTS "," PARTS ",30)     /* JB */
replace standard_name = subinstr(standard_name," PUBLISH "," PUBLISHING ",30)    /* JB */
replace standard_name = subinstr(standard_name," PUBLSHING "," PUBLISHING ",30)  /* JB */
replace standard_name = subinstr(standard_name," PUBN "," PUBLICATIONS ",30)     /* JB */
replace standard_name = subinstr(standard_name," PUBNS "," PUBLICATIONS ",30)    /* JB */
replace standard_name = subinstr(standard_name," PWR "," POWER ",30)     /* JB */
replace standard_name = subinstr(standard_name," RAILRD "," RAILROAD ",30)   /* JB */
replace standard_name = subinstr(standard_name," RECREATN "," RECREATION ",30)   /* JB */
replace standard_name = subinstr(standard_name," RECYCL "," RECYCLING ",30)  /* JB */
replace standard_name = subinstr(standard_name," REFIN "," REFINING ",30)    /* JB */
replace standard_name = subinstr(standard_name," REFNG "," REFINING ",30)    /* JB */
replace standard_name = subinstr(standard_name," RESTR "," RESTAURANT ",30)  /* JB */
replace standard_name = subinstr(standard_name," RESTS "," RESTAURANTS ",30)     /* JB */
replace standard_name = subinstr(standard_name," RETAILNG "," RETAILING ",30)    /* JB */
replace standard_name = subinstr(standard_name," RLTY "," REALTY ",30)   /* JB */
replace standard_name = subinstr(standard_name," RR "," RAILROAD ",30)   /* JB */
replace standard_name = subinstr(standard_name," RSCH "," RESEARCH ",30)     /* JB */
replace standard_name = subinstr(standard_name," RTNG "," RATING ",30)   /* JB */
replace standard_name = subinstr(standard_name," SCIENTIF "," SCIENTIFIC ",30)   /* JB */
replace standard_name = subinstr(standard_name," SERV "," SERVICES ",30)     /* JB */
replace standard_name = subinstr(standard_name," SLTNS "," SOLUTIONS ",30)   /* JB */
replace standard_name = subinstr(standard_name," SOFTWRE "," SOFTWARE ",30)  /* JB */
replace standard_name = subinstr(standard_name," SOLTNS "," SOLUTIONS ",30)  /* JB */
replace standard_name = subinstr(standard_name," SOLUT "," SOLUTIONS ",30)   /* JB */
replace standard_name = subinstr(standard_name," SRVC "," SERVICES ",30)     /* JB */
replace standard_name = subinstr(standard_name," SRVCS "," SERVICES ",30)    /* JB */
replace standard_name = subinstr(standard_name," STEAKHSE "," STEAKHOUSE ",30)   /* JB */
replace standard_name = subinstr(standard_name," STHWST "," SOUTHWEST ",30)  /* JB */
replace standard_name = subinstr(standard_name," STL "," STEEL ",30)     /* JB */
replace standard_name = subinstr(standard_name," STRS "," STORES ",30)   /* JB */
replace standard_name = subinstr(standard_name," SUP "," SUPPLY ",30)    /* JB */
replace standard_name = subinstr(standard_name," SUPERMKTS "," SUPERMARKETS ",30)    /* JB */
replace standard_name = subinstr(standard_name," SUPP "," SUPPLIES ",30)     /* JB */
replace standard_name = subinstr(standard_name," SURVYS "," SURVEYS ",30)    /* JB */
replace standard_name = subinstr(standard_name," SVC "," SERVICES ",30)  /* JB */
replace standard_name = subinstr(standard_name," SVCS "," SERVICES ",30)     /* JB */
replace standard_name = subinstr(standard_name," SVSC "," SERVICES ",30)     /* JB */
replace standard_name = subinstr(standard_name," SYS "," SYSTEMS ",30)   /* JB */
replace standard_name = subinstr(standard_name," SYSTM "," SYSTEMS ",30)     /* JB */
replace standard_name = subinstr(standard_name," TCHNLGY "," TECH ",30)  /* JB */
replace standard_name = subinstr(standard_name," TECHNGS "," TECHNOLOGIES ",30)  /* JB */
replace standard_name = subinstr(standard_name," TECHNL "," TECH ",30)   /* JB */
replace standard_name = subinstr(standard_name," TECHNLGIES "," TECHNOLOGIES ",30)   /* JB */
replace standard_name = subinstr(standard_name," TEL "," TELEPHONE ",30)     /* JB */
replace standard_name = subinstr(standard_name," TELE-COMM "," TELECOMMUNICATIONS ",30)  /* JB */
replace standard_name = subinstr(standard_name," TELE-COMMUN "," TELECOMMUNICATIONS ",30)    /* JB */
replace standard_name = subinstr(standard_name," TELECOMMS "," TELECOMMUNICATIONS ",30)  /* JB */
replace standard_name = subinstr(standard_name," TELECONFERENC "," TELECONFERENCING ",30)    /* JB */
replace standard_name = subinstr(standard_name," TELEG "," TELEGRAPH ",30)   /* JB */
replace standard_name = subinstr(standard_name," TELEGR "," TELEGRAPH ",30)  /* JB */
replace standard_name = subinstr(standard_name," TELVSN "," TELEVISION ",30)     /* JB */
replace standard_name = subinstr(standard_name," TR "," TRUST ",30)  /* JB */
replace standard_name = subinstr(standard_name," TRANSN "," TRANSPORTATION ",30)     /* JB */
replace standard_name = subinstr(standard_name," TRANSPORTN "," TRANSPORTATION ",30)     /* JB */
replace standard_name = subinstr(standard_name," TRNSACTN "," TRANSACTION ",30)  /* JB */
replace standard_name = subinstr(standard_name," UTD "," UNITED ",30)    /* JB */
replace standard_name = subinstr(standard_name," WSTN "," WESTERN ",30)  /* JB */
replace standard_name = subinstr(standard_name," WTR "," WATER ",30)     /* JB */
replace standard_name=" U.S. PHILIPS CORPORATION " if trim(standard_name)=="NORTH AMERICAN PHILIPS CORP"
replace standard_name=" A. L. WILLIAMS CORP. " if trim(standard_name)=="WILLIAMS (A.L.) CORP"
replace standard_name=" B. F. GOODRICH CO. " if trim(standard_name)=="GOODRICH CORP"
replace standard_name=" BELL + HOWELL COMPANY " if trim(standard_name)=="BELL & HOWELL OPERATING CO"
replace standard_name=" BENDIX CORPORATION(NOW ALLIED-SIGNAL INC.) " if trim(standard_name)=="BENDIX CORP"
replace standard_name=" BORG-WARNER CORPORATION " if trim(standard_name)=="BORGWARNER INC"
replace standard_name=" CHRYSLER MOTORS CORPORATION " if trim(standard_name)=="CHRYSLER CORP"
replace standard_name=" CISCO TECHNOLOGY, INC. " if trim(standard_name)=="CISCO SYSTEMS INC"
replace standard_name=" DELL PRODUCTS, L.P. " if trim(standard_name)=="DELL INC"
replace standard_name=" DELPHI TECHNOLOGIES, INC. " if trim(standard_name)=="DELPHI CORP"
replace standard_name=" E. I. DU PONT DE NEMOURS AND COMPANY " if trim(standard_name)=="DU PONT (E I) DE NEMOURS"
replace standard_name=" E. R. SQUIBB + SONS, INC. " if trim(standard_name)=="SQUIBB CORP"
replace standard_name=" ELI LILLY AND COMPANY " if trim(standard_name)=="LILLY (ELI) & CO"
replace standard_name=" G. D. SEARLE & CO. " if trim(standard_name)=="SEARLE (G.D.) & CO"
replace standard_name=" MINNESOTA MINING AND MANUFACTURING COMPANY " if trim(standard_name)=="3M CO"
replace standard_name=" OWENS-CORNING FIBERGLAS CORPORATION " if trim(standard_name)=="OWENS CORNING"
replace standard_name=" SCHLUMBERGER TECHNOLOGY CORPORATION " if trim(standard_name)=="SCHLUMBERGER LTD"
replace standard_name=" SCI-MED LIFE SYSTEMS, INC. " if trim(standard_name)=="SICMED LIFE SYSTEMS"
replace standard_name=" TDK CORPORATION " if trim(standard_name)=="TDK CORP"
replace standard_name=" UNITED STATES SURGICAL CORPORATION " if trim(standard_name)=="U S SURGICAL CORP"
replace standard_name=" W. R. GRACE & CO. " if trim(standard_name)=="GRACE (W R) & CO"
replace standard_name=" WESTINGHOUSE ELECTRIC CORP. " if trim(standard_name)=="WESTINGHOUSE ELEC"

** EPO Espace specific character format problems
** For files downloaded from EPO Espace & appears as &amp; 
** Also recode all common words for "AND" to &
tempvar len
gen `len' =length(standard_name)

replace standard_name = subinstr( standard_name, "&AMP;", " & ", 5)
replace standard_name = subinstr( standard_name, "+", " & ", 5)
replace standard_name = subinstr( standard_name, " AND ", " & ", 5)
replace standard_name = subinstr( standard_name, " ET ", " & ", 5)
replace standard_name = subinstr( standard_name, " UND ", " & ", 5)
replace standard_name = subinstr( standard_name, "&", " & ",30)     /* BHH - ensure that & is separate word */

** British - specific problem with names that end in (THE) and names that start with
** THE, so remove these
replace standard_name=substr(standard_name, 1, `len'-5) if substr(standard_name, -5, 5)=="(THE)"
replace standard_name=substr(standard_name, 5, .) if substr(standard_name, 1, 4)=="THE "

** strip punctuation 
replace standard_name = subinstr( standard_name, "'",  "", 30)
replace standard_name = subinstr( standard_name, ";",  "", 30) 
replace standard_name = subinstr( standard_name, "^",  "", 30)
replace standard_name = subinstr( standard_name, "<",  "", 30)
replace standard_name = subinstr( standard_name, ".",  "", 30)
replace standard_name = subinstr( standard_name, "`",  "", 30)
replace standard_name = subinstr( standard_name, "_",  "", 30)
replace standard_name = subinstr( standard_name, ">",  "", 30)
replace standard_name = subinstr( standard_name, "''", "", 30)
replace standard_name = subinstr( standard_name, "!",  "", 30)
replace standard_name = subinstr( standard_name, "+",  "", 30)
replace standard_name = subinstr( standard_name, "?",  "", 30)
replace standard_name = subinstr( standard_name, "(",  "", 30)
replace standard_name = subinstr( standard_name, "£",  "", 30)
replace standard_name = subinstr( standard_name, "{",  "", 30)
replace standard_name = subinstr( standard_name, "\",  "", 30)
replace standard_name = subinstr( standard_name, ")",  "", 30)
replace standard_name = subinstr( standard_name, "$",  "", 30)
replace standard_name = subinstr( standard_name, "}",  "", 30)
replace standard_name = subinstr( standard_name, "|",  "", 30)
replace standard_name = subinstr( standard_name, ",",  "", 30)
replace standard_name = subinstr( standard_name, "%",  "", 30)
replace standard_name = subinstr( standard_name, "[",  "", 30)
replace standard_name = subinstr( standard_name, "¦",  "", 30)
replace standard_name = subinstr( standard_name, "*",  "", 30)
replace standard_name = subinstr( standard_name, "]",  "", 30)
replace standard_name = subinstr( standard_name, "/",  " ", 30) 
replace standard_name = subinstr( standard_name, "@",  "", 30)
replace standard_name = subinstr( standard_name, ":",  "", 30)
replace standard_name = subinstr( standard_name, "~",  "", 30)
replace standard_name = subinstr( standard_name, "#",  "", 30)
replace standard_name = subinstr( standard_name, "-",  " ", 30) 
replace standard_name = subinstr( standard_name, "  ", " ", 30) 


************** clean common corporate type identifiers 
** 2) Perform some additional changes
replace standard_name = subinstr( standard_name, " RES & DEV ", " R&D ", 1)
replace standard_name = subinstr( standard_name, " RECH & DEV ", " R&D ", 1)

** 3) Perform some country specific work
** UNITED STATES (most of this is in Derwent)

** UNITED KINGDOM
replace standard_name = subinstr( standard_name, " PUBLIC LIMITED ", " PLC ", 1)
replace standard_name = subinstr( standard_name, " PUBLIC LIABILITY COMPANY ", " PLC ", 1)
replace standard_name = subinstr( standard_name, " HOLDINGS ", " HLDGS ", 1)
replace standard_name = subinstr( standard_name, " HOLDING ", " HLDGS ", 1)
replace standard_name = subinstr( standard_name, " GREAT BRITAIN ", " GB ", 1)
replace standard_name = subinstr( standard_name, " LTD CO ", " CO LTD ", 1)

** SPANISH
replace standard_name = subinstr( standard_name, " SOC LIMITADA ", " SL ", 1)
replace standard_name = subinstr( standard_name, " SOC EN COMMANDITA ", " SC ", 1)
replace standard_name = subinstr( standard_name, " & CIA ", " CO ", 1)
replace company =standard_name  

if `iter' > 1 append using `main'
save `main', replace  
di "Finished iteration `iter'"
}

export delimited "$output_data/infogroup_stdnames.csv", replace

/*

* merge the survey and infogroup names chunk by chunk
use "$output_data/standard_name_cw.dta", clear 
cap rename state_abbrev state 

tempfile survey best_matches
save `survey', replace  

local chunk_size 75000        // how many lines to read in at once? 
local this_chunk `chunk_size'

clear 

local iter 0
while (`this_chunk' + 1 >= `chunk_size') {
	import delimited "$output_data/infogroup_stdnames.csv", /// 
	clear varnames(1) rowrange(`=(`iter' * `chunk_size') + 1':`=(`iter++' + 1) * `chunk_size'')

	qui: count    
	local this_chunk = `r(N)'
	assert company == standard_name
	drop company      
	
	joinby state using `survey'    
	
	ustrdist company standard_name, gen(name_dist)
	
	gen tie_break = runiform()

	if `iter' > 1 { 
		append using `best_matches'
		local cmd replace 
	} 
	else {
		local cmd gen 
	}

	bysort company state (name_dist tie_break): `cmd' best = _n == 1
	bysort company state name_dist (tie_break): `cmd' tied = _N > 1

	keep if best 

	order company estab_name standard_name state name_dist tie_break

	save `best_matches', replace 
	
	di "Finished iteration `iter'" _n
}

save "$output_data/infogroup_data.dta", replace 

*/
/*==================================================================
Clean cps data to merge in local industry unemployment rates and low skill unemployment rates
==================================================================*/
clear
quietly infix              ///
int     year      1-4      ///
long    serial    5-9      ///
byte    month     10-11    ///
double  hwtfinl   12-21    ///
double  cpsid     22-35    ///
byte    asecflag  36-36    ///
double  asecwth   37-46    ///
byte    statefip  47-48    ///
long    county    49-53    ///
long    metfips   54-58    ///
byte    pernum    59-60    ///
double  wtfinl    61-74    ///
double  cpsidp    75-88    ///
double  asecwt    89-98    ///
byte    empstat   99-100   ///
int     ind       101-104  ///
int     educ      105-107  ///
byte    eddipged  108-109  ///
using `"$input_data/cps_00002.dat"'

replace hwtfinl  = hwtfinl / 10000
replace asecwth  = asecwth / 10000
replace wtfinl   = wtfinl  / 10000
replace asecwt   = asecwt  / 10000

format hwtfinl  %10.4f
format cpsid    %14.0f
format asecwth  %10.4f
format wtfinl   %14.4f
format cpsidp   %14.0f
format asecwt   %10.4f

label var year     `"Survey year"'
label var serial   `"Household serial number"'
label var month    `"Month"'
label var hwtfinl  `"Household weight, Basic Monthly"'
label var cpsid    `"CPSID, household record"'
label var asecflag `"Flag for ASEC"'
label var asecwth  `"Annual Social and Economic Supplement Household weight"'
label var statefip `"State (FIPS code)"'
label var county   `"FIPS county code"'
label var metfips  `"Metropolitan area FIPS code"'
label var pernum   `"Person number in sample unit"'
label var wtfinl   `"Final Basic Weight"'
label var cpsidp   `"CPSID, person record"'
label var asecwt   `"Annual Social and Economic Supplement Weight"'
label var empstat  `"Employment status"'
label var ind      `"Industry"'
label var educ     `"Educational attainment recode"'
label var eddipged `"High school or GED"'

label define month_lbl 01 `"January"'
label define month_lbl 02 `"February"', add
label define month_lbl 03 `"March"', add
label define month_lbl 04 `"April"', add
label define month_lbl 05 `"May"', add
label define month_lbl 06 `"June"', add
label define month_lbl 07 `"July"', add
label define month_lbl 08 `"August"', add
label define month_lbl 09 `"September"', add
label define month_lbl 10 `"October"', add
label define month_lbl 11 `"November"', add
label define month_lbl 12 `"December"', add
label values month month_lbl

label define asecflag_lbl 1 `"ASEC"'
label define asecflag_lbl 2 `"March Basic"', add
label values asecflag asecflag_lbl

label define statefip_lbl 01 `"Alabama"'
label define statefip_lbl 02 `"Alaska"', add
label define statefip_lbl 04 `"Arizona"', add
label define statefip_lbl 05 `"Arkansas"', add
label define statefip_lbl 06 `"California"', add
label define statefip_lbl 08 `"Colorado"', add
label define statefip_lbl 09 `"Connecticut"', add
label define statefip_lbl 10 `"Delaware"', add
label define statefip_lbl 11 `"District of Columbia"', add
label define statefip_lbl 12 `"Florida"', add
label define statefip_lbl 13 `"Georgia"', add
label define statefip_lbl 15 `"Hawaii"', add
label define statefip_lbl 16 `"Idaho"', add
label define statefip_lbl 17 `"Illinois"', add
label define statefip_lbl 18 `"Indiana"', add
label define statefip_lbl 19 `"Iowa"', add
label define statefip_lbl 20 `"Kansas"', add
label define statefip_lbl 21 `"Kentucky"', add
label define statefip_lbl 22 `"Louisiana"', add
label define statefip_lbl 23 `"Maine"', add
label define statefip_lbl 24 `"Maryland"', add
label define statefip_lbl 25 `"Massachusetts"', add
label define statefip_lbl 26 `"Michigan"', add
label define statefip_lbl 27 `"Minnesota"', add
label define statefip_lbl 28 `"Mississippi"', add
label define statefip_lbl 29 `"Missouri"', add
label define statefip_lbl 30 `"Montana"', add
label define statefip_lbl 31 `"Nebraska"', add
label define statefip_lbl 32 `"Nevada"', add
label define statefip_lbl 33 `"New Hampshire"', add
label define statefip_lbl 34 `"New Jersey"', add
label define statefip_lbl 35 `"New Mexico"', add
label define statefip_lbl 36 `"New York"', add
label define statefip_lbl 37 `"North Carolina"', add
label define statefip_lbl 38 `"North Dakota"', add
label define statefip_lbl 39 `"Ohio"', add
label define statefip_lbl 40 `"Oklahoma"', add
label define statefip_lbl 41 `"Oregon"', add
label define statefip_lbl 42 `"Pennsylvania"', add
label define statefip_lbl 44 `"Rhode Island"', add
label define statefip_lbl 45 `"South Carolina"', add
label define statefip_lbl 46 `"South Dakota"', add
label define statefip_lbl 47 `"Tennessee"', add
label define statefip_lbl 48 `"Texas"', add
label define statefip_lbl 49 `"Utah"', add
label define statefip_lbl 50 `"Vermont"', add
label define statefip_lbl 51 `"Virginia"', add
label define statefip_lbl 53 `"Washington"', add
label define statefip_lbl 54 `"West Virginia"', add
label define statefip_lbl 55 `"Wisconsin"', add
label define statefip_lbl 56 `"Wyoming"', add
label define statefip_lbl 61 `"Maine-New Hampshire-Vermont"', add
label define statefip_lbl 65 `"Montana-Idaho-Wyoming"', add
label define statefip_lbl 68 `"Alaska-Hawaii"', add
label define statefip_lbl 69 `"Nebraska-North Dakota-South Dakota"', add
label define statefip_lbl 70 `"Maine-Massachusetts-New Hampshire-Rhode Island-Vermont"', add
label define statefip_lbl 71 `"Michigan-Wisconsin"', add
label define statefip_lbl 72 `"Minnesota-Iowa"', add
label define statefip_lbl 73 `"Nebraska-North Dakota-South Dakota-Kansas"', add
label define statefip_lbl 74 `"Delaware-Virginia"', add
label define statefip_lbl 75 `"North Carolina-South Carolina"', add
label define statefip_lbl 76 `"Alabama-Mississippi"', add
label define statefip_lbl 77 `"Arkansas-Oklahoma"', add
label define statefip_lbl 78 `"Arizona-New Mexico-Colorado"', add
label define statefip_lbl 79 `"Idaho-Wyoming-Utah-Montana-Nevada"', add
label define statefip_lbl 80 `"Alaska-Washington-Hawaii"', add
label define statefip_lbl 81 `"New Hampshire-Maine-Vermont-Rhode Island"', add
label define statefip_lbl 83 `"South Carolina-Georgia"', add
label define statefip_lbl 84 `"Kentucky-Tennessee"', add
label define statefip_lbl 85 `"Arkansas-Louisiana-Oklahoma"', add
label define statefip_lbl 87 `"Iowa-N Dakota-S Dakota-Nebraska-Kansas-Minnesota-Missouri"', add
label define statefip_lbl 88 `"Washington-Oregon-Alaska-Hawaii"', add
label define statefip_lbl 89 `"Montana-Wyoming-Colorado-New Mexico-Utah-Nevada-Arizona"', add
label define statefip_lbl 90 `"Delaware-Maryland-Virginia-West Virginia"', add
label define statefip_lbl 99 `"State not identified"', add
label values statefip statefip_lbl

label define empstat_lbl 00 `"NIU"'
label define empstat_lbl 01 `"Armed Forces"', add
label define empstat_lbl 10 `"At work"', add
label define empstat_lbl 12 `"Has job, not at work last week"', add
label define empstat_lbl 20 `"Unemployed"', add
label define empstat_lbl 21 `"Unemployed, experienced worker"', add
label define empstat_lbl 22 `"Unemployed, new worker"', add
label define empstat_lbl 30 `"Not in labor force"', add
label define empstat_lbl 31 `"NILF, housework"', add
label define empstat_lbl 32 `"NILF, unable to work"', add
label define empstat_lbl 33 `"NILF, school"', add
label define empstat_lbl 34 `"NILF, other"', add
label define empstat_lbl 35 `"NILF, unpaid, lt 15 hours"', add
label define empstat_lbl 36 `"NILF, retired"', add
label values empstat empstat_lbl

label define educ_lbl 000 `"NIU or no schooling"'
label define educ_lbl 001 `"NIU or blank"', add
label define educ_lbl 002 `"None or preschool"', add
label define educ_lbl 010 `"Grades 1, 2, 3, or 4"', add
label define educ_lbl 011 `"Grade 1"', add
label define educ_lbl 012 `"Grade 2"', add
label define educ_lbl 013 `"Grade 3"', add
label define educ_lbl 014 `"Grade 4"', add
label define educ_lbl 020 `"Grades 5 or 6"', add
label define educ_lbl 021 `"Grade 5"', add
label define educ_lbl 022 `"Grade 6"', add
label define educ_lbl 030 `"Grades 7 or 8"', add
label define educ_lbl 031 `"Grade 7"', add
label define educ_lbl 032 `"Grade 8"', add
label define educ_lbl 040 `"Grade 9"', add
label define educ_lbl 050 `"Grade 10"', add
label define educ_lbl 060 `"Grade 11"', add
label define educ_lbl 070 `"Grade 12"', add
label define educ_lbl 071 `"12th grade, no diploma"', add
label define educ_lbl 072 `"12th grade, diploma unclear"', add
label define educ_lbl 073 `"High school diploma or equivalent"', add
label define educ_lbl 080 `"1 year of college"', add
label define educ_lbl 081 `"Some college but no degree"', add
label define educ_lbl 090 `"2 years of college"', add
label define educ_lbl 091 `"Associate's degree, occupational/vocational program"', add
label define educ_lbl 092 `"Associate's degree, academic program"', add
label define educ_lbl 100 `"3 years of college"', add
label define educ_lbl 110 `"4 years of college"', add
label define educ_lbl 111 `"Bachelor's degree"', add
label define educ_lbl 120 `"5+ years of college"', add
label define educ_lbl 121 `"5 years of college"', add
label define educ_lbl 122 `"6+ years of college"', add
label define educ_lbl 123 `"Master's degree"', add
label define educ_lbl 124 `"Professional school degree"', add
label define educ_lbl 125 `"Doctorate degree"', add
label define educ_lbl 999 `"Missing/Unknown"', add
label values educ educ_lbl

label define eddipged_lbl 01 `"Graduated from high school"'
label define eddipged_lbl 02 `"GED or other equivilent"', add
label define eddipged_lbl 99 `"NIU"', add
label values eddipged eddipged_lbl

save "$output_data/cps.dta", replace

* Create different versions of unemployment rate using the cps data
use "$output_data/cps.dta", clear
rename county combined_fips

* Restrict to January-March 2020
keep if year == 2020
keep if month == 1 | month == 2 | month == 3

* Indicator for unemployed if empstat is "Unemployed, experienced worker" or "Unemployed, new worker"
gen unemployed = 1 if empstat == 20 | empstat == 21 | empstat == 22 

* Indicator for employed if empstat is "At work" or "Has job, not at work last week"
gen employed = 1 if empstat == 10 | empstat == 12 

* Re-code industries based on code list from the US Census: https://www2.census.gov/programs-surveys/demo/guidance/industry-occupation/2017-industry-code-list-with-crosswalk.xlsx
gen industry = "Agriculture, Forestry \& Fishing" if ind >= 170 & ind <= 290
replace industry = "Mining" if ind >= 370 & ind <= 490
replace industry = "Construction" if ind == 770
replace industry = "Manufacturing" if ind >= 1070 & ind <= 3990
replace industry = "Transportation \& Public Utilities" if (ind >= 6070 & ind <= 6390) | (ind >= 570 & ind <= 690)
replace industry = "Retail" if ind >= 4670 & ind <= 5790
replace industry = "Wholesale Trade" if ind >= 4070 & ind <= 4590
replace industry = "Finance, Insurance, \& Real Estate" if ind >= 6870 & ind <= 7190
replace industry = "Service" if ind >= 7270 & ind <= 9290
replace industry = "Public Administration" if ind >= 9370 & ind <= 9590
replace industry = "Nonclassifiable" if ind >= 9670
keep if !missing(industry)  

* Indicator for attaining high school or less
gen hs_or_below = educ <= 73 

* Create unemployment rate by metropolitan area and industry
preserve
collapse (sum) unemployed employed if asecflag != 1 [pweight=wtfinl], by(metfips industry)
gen local_ind_ue_rate = unemployed / (unemployed + employed)
keep metfips industry local_ind_ue_rate
save "$output_data/cps_local_ind_ue_rate_jan2020_mar2020.dta", replace
restore

* Create unemployment rate by metropolitan area for people with high school degrees or below
preserve
collapse (sum) unemployed employed if asecflag != 1 [pweight=wtfinl], by(metfips hs_or_below)
keep if hs_or_below == 1
gen local_low_skill_ue_rate = unemployed / (unemployed + employed)
keep metfips local_low_skill_ue_rate
save "$output_data/cps_low_skill_ue_rate_jan2020_mar2020.dta", replace
restore

* Create unemployment rate by metropolitan area
preserve
collapse (sum) unemployed employed if asecflag != 1 [pweight=wtfinl], by(metfips)
gen met_ue_rate = unemployed / (unemployed + employed)
keep metfips met_ue_rate
save "$output_data/cps_met_ue_rate_jan2020_mar2020.dta", replace
restore


/*==================================================================
*** COMBINE DATA ***
==================================================================*/

use "$output_data/combined_distributions.dta", clear 

** Get the firm info from the initial distribution 
merge m:1 email using "$input_data/firm_covariates.dta"
assert _m != 1  if !postcovid       // we should have everyone that we distributed to   
					//  the new distribution people will be missing from this 
keep if _m == 3                     // only keep people who responded 
drop _m                   

** Get additional information from Platform admin data 
merge m:1 email using "$output_data/platform_universe.dta", keep(1 3) nogen keepusing(empsize platform_tenure industry)
merge m:1 email using "$output_data/platform_requestor_info.dta", keep(1 3) nogen 


** Merge on Infogroup data 
* First get the standardized company name 
statastates, name(state) nogen

merge m:1 estab_name state_abbrev using "$output_data/infogroup_data.dta", nogen keep(1 3)

preserve 
import excel "$input_data/merge_manual_review.xlsx", clear firstrow
duplicates drop

tempfile match_approved
save `match_approved', replace
restore 

merge m:1 standard_name company using `match_approved', nogen keep(1 3)

gen company_raw = company
foreach var in sic sic_desc estab_year emp_size_bin{
cap: replace `var'  = "" if match_approved != 1 
cap: replace `var'  = . if match_approved != 1 
assert missing(`var') if match_approved != 1 
}

**** Merge on covid data 
replace county = "San Francisco County" if county == "San Francisco"
replace state = proper(state)

gen name = county + ", " + state

merge m:1 name using "$input_data/covid_county_HBS.dta", keepusing(popdensity2010 population_total cases median_cases_pc quartile_cases_pc county_fips) keep(1 3) nogen  
merge m:1 name using "$input_data/county_demographics.dta", keep(1 3) nogen 
merge m:1 name using "input_data/county_crime.dta", keep(1 3) nogen 
gen m_white = median_white_county
gen q_white = quartile_white_county
gen m_black = median_black_county
gen q_black = quartile_black_county
rename county_fips combined_fips 

* Prepare and merge in Metropolitan FIPS code using crosswalk
preserve
use "$input_data/cbsa2fipsxw.dta", clear
rename cbsacode metfips
destring metfips, replace
gen combined_fips = fipsstatecode + fipscountycode
destring combined_fips, replace
tostring combined_fips, replace
keep combined_fips metfips 
tempfile metcrosswalk
save `metcrosswalk'
restore 

merge m:1 combined_fips using `metcrosswalk', keep(1 3) nogen

* Merge in unemployment rate by county data
merge m:1 combined_fips using "$input_data/Mar2020_ue_by_county.dta", keep(1 3) nogen
gen county_ue_rate = unemp_lvl/labor_force 
drop name

* log of cases per population 
gen ln_cases_pc = ln((cases + 1)/population_total)

* log of population density in 2020
gen ln_popden = ln(popdensity2010) 

* Indicator for state of emergency declaration due to COVID-19
gen state_of_emergency = response_date >= date("3/13/2020", "MDY") if !missing(response_date)


sleep 2000
save "$output_data/combined_distributions.dta", replace 

sleep 2000

** additional cleaning to prepare for analysis
clean_survey using "$output_data/combined_distributions.dta" , saveas("$output_data/main_survey.dta") replace 

use "$output_data/main_survey_wide.dta", clear
merge m:1 metfips industry using "$output_data/cps_local_ind_ue_rate_jan2020_mar2020.dta", keep(1 3) nogen 
merge m:1 metfips using "$output_data/cps_low_skill_ue_rate_jan2020_mar2020.dta", keep(1 3) nogen 
merge m:1 metfips using "$output_data/cps_met_ue_rate_jan2020_mar2020.dta", keep(1 3) nogen 
save "$output_data/main_survey_wide.dta", replace

use "$output_data/main_survey_long.dta", clear
merge m:1 metfips industry using "$output_data/cps_local_ind_ue_rate_jan2020_mar2020.dta", keep(1 3) nogen 
merge m:1 metfips using "$output_data/cps_low_skill_ue_rate_jan2020_mar2020.dta", keep(1 3) nogen 
merge m:1 metfips using "$output_data/cps_met_ue_rate_jan2020_mar2020.dta", keep(1 3) nogen 
sleep 2000
save "$output_data/main_survey_long.dta", replace


/*========================================
*** Make Reweighted Figures Data ***     
========================================*/
use "$output_data/main_survey_wide.dta", clear

*collapse WCC data to firm level
collapse (firstnm) industry sic1 company (mean) platform_tenure estab_age empsize wcc_policy shrm_best_candidate shrm_second_chance shrm_incentivized shrm_customers shrm_regulations shrm_performance months_on_platform, by(firm_id)
keep if !missing(industry) & !missing(wcc_policy) & !missing(shrm_best_candidate) & !missing(shrm_customers) & !missing(shrm_incentivized) & !missing(shrm_performance) & !missing(shrm_regulations) & !missing(shrm_second_chance) & !missing(empsize)
table industry empsize

*generate indicators for each industry and labels for them
tab industry, sort gen(ind) 
local top_n 9
forvalues ind = 1/`top_n' {
	tokenize "`: variable label ind`ind''", parse("==")
	local indlabel`ind' = "`3'"
}

*import Infogroup data
use "$output_data/infogroup_universe.dta", clear 
local n_industries = 9

*find Infogroup data's age quartiles
su estab_age, det
drop if estab_age > `r(p95)' // & !missing(estab_age)
gquantiles estab_age, _pctile percentile(50)
local IG_firmAge = string(r(r1),"%04.1f")
egen age_quartile = xtile(estab_age), n(4)
pctile age_quartile_val = estab_age, nq(4) genp(age_percent)
list age_percent age_quartile_val in 1/4

/**			
age_pe~t	age_qu~l	

1.	25	4	
2.	50	16	
3.	75	36	
4.	.	.	
**/
local age_q1 = age_quartile_val[1]
local age_q2 = age_quartile_val[2]
local age_q3 = age_quartile_val[3]
local age_q4 = 999
replace age_quartile_val = .
forvalues q = 1/4 {
	replace age_quartile_val = `age_q`q'' if age_quartile == `q' & missing(age_quartile_val)  
}

* Find Infogroup data's size quartiles
gquantiles empsize, _pctile percentile(50)
local IG_firmSize = string(r(r1),"%9.1g")
egen size_tercile=xtile(empsize), n(3) 
pctile size_tercile_val = empsize, nq(3) genp(size_percent)
list size_percent size_tercile_val in 1/3
tab size_tercile
local size_t1 = size_tercile_val[1]
local size_t2 = size_tercile_val[2]
local size_t3 = 999999

replace size_tercile_val = .
forvalues t = 1/3 {
	replace size_tercile_val = `size_t`t'' if size_tercile == `t' & missing(size_tercile_val) 
}

* Calculate Infogroup cell sizes and proportion of cell size out of total Infogroup data
gen counts = 1
egen cell_size_ind_firmsize = total(counts), by(sic size_tercile) 
egen cell_size_ind_age = total(counts), by(sic age_quartile) 
keep cell_size_ind_firmsize cell_size_ind_age sic size_tercile size_tercile_val age_quartile age_quartile_val
gen counts = 1
egen size_total = total(counts) if !missing(sic) & !missing(size_tercile)
egen age_total = total(counts) if !missing(sic) & !missing(age_quartile)
gen cell_percent = cell_size_ind_firmsize / size_total
gen cell_percent_age = cell_size_ind_age / age_total

* Rename sic and prepare it for merging by recasting it
rename sic industry 
recast str34 industry
tempfile infogroup_cells_all
save `infogroup_cells_all'

* Create data for weights based on industry and firm size
keep cell_size_ind_firmsize cell_percent industry size_tercile size_tercile_val 
duplicates drop
tempfile infogroup_cells
save `infogroup_cells'

* Create data for weights based on industry and age
use `infogroup_cells_all', clear
keep cell_size_ind_age cell_percent_age industry age_quartile age_quartile_val 
duplicates drop
tempfile infogroup_cells_age
save `infogroup_cells_age'

* Add in new firm size data
import delimited "$input_data/HSB_worker_data_firmsize.csv", encoding(UTF-8) clear

* Clean company name
gen company = subinstr(company_name,"  "," ", .)
replace company = subinstr(company,".","", .)
replace company = subinstr(company,"?","", .)
replace company = subinstr(company,"(","", .)
replace company = subinstr(company,")","", .)
replace company = subinstr(company,",","", .)
replace company = subinstr(company,"'","", .)
replace company = upper(company)

collapse (mean) firm_size, by(company)

tempfile firmsize
save `firmsize'

use "$output_data/main_survey_wide.dta", clear
*generate industry indicators
tab industry, sort gen(ind) 
*collapse by firm
collapse (firstnm) industry sic1 company ind1 ind2 ind3 ind4 ind5 ind6 ind7 ind8 ind9 (mean) platform_tenure estab_age empsize wcc_policy shrm_best_candidate shrm_second_chance shrm_incentivized shrm_customers shrm_regulations shrm_performance months_on_platform, by(firm_id)
replace company = subinstr(company,"  "," ", .)
replace company = subinstr(company,".","", .)
replace company = subinstr(company,"?","", .)
replace company = subinstr(company,"(","", .)
replace company = subinstr(company,")","", .)
replace company = subinstr(company,",","", .)
replace company = subinstr(company,"'","", .)
replace company = upper(company)

* Merge in new firm size data
merge m:1 company using `firmsize'
* 724 matches
tab _merge
drop if _merge==2

replace empsize = firm_size if missing(empsize)
keep if !missing(industry) & !missing(wcc_policy) & !missing(shrm_best_candidate) & !missing(shrm_customers) & !missing(shrm_incentivized) & !missing(shrm_performance) & !missing(shrm_regulations) & !missing(shrm_second_chance) & !missing(empsize)
* Generate age and size variables needed for merging based on Infogroup data
gen age_quartile = .
forvalues q = 1/4 {
	replace age_quartile = `q' if estab_age <= `age_q`q'' & missing(age_quartile) 
}

gen size_tercile = .
forvalues t = 1/3 {
	replace size_tercile = `t' if empsize <= `size_t`t'' & missing(size_tercile) 
}
recast str34 industry

drop _merge 
merge m:1 industry size_tercile using `infogroup_cells'
drop if _merge==2

drop _merge 
merge m:1 industry age_quartile using `infogroup_cells_age'
drop if _merge==2

gen counts = 1
egen sample_cell_size_ind_firmsize = total(counts), by(industry size_tercile) 
egen sample_cell_size_ind_age = total(counts), by(industry age_quartile) 
egen sample_total = total(counts) if !missing(industry) & !missing(size_tercile)
egen sample_total_age = total(counts) if !missing(industry) & !missing(age_quartile)
gen sample_cell_percent = sample_cell_size_ind_firmsize / sample_total
gen weight = cell_percent / sample_cell_percent
gen sample_cell_percent_age = sample_cell_size_ind_age / sample_total_age
gen weight_age = cell_percent_age / sample_cell_percent_age
gen size_t1 = size_tercile == 1
gen size_t2 = size_tercile == 2
gen size_t3 = size_tercile == 3


* SHRM marginal proportions based on SHRM.xlsx
gen wcc_p = round(wcc_policy)
gen shrm_bc = round(shrm_best_candidate)
gen shrm_sc = round(shrm_second_chance)
gen shrm_in = round(shrm_incentivized)
gen shrm_cu = round(shrm_customers)
gen shrm_re = round(shrm_regulations)
gen shrm_pe = round(shrm_performance)

* Calculate weights based on 7 SHRM policies and Infogroup industry shares
ipfweight wcc_p shrm_bc shrm_sc shrm_in shrm_cu shrm_re shrm_pe ind1 ind2 ind3 ind4 ind5 ind6 ind7 ind8 , gen(wt2) val(34 66 47 53 62 38 98 2 70 30 78 22 96 4 63.11 36.89 93.86 6.14 78.74 21.26 97.16 2.84 92.17 7.83 98.38 1.62 92.82 7.18 91.62 8.28 ) maxit(50) 

* Calculate weights based on 7 SHRM policies
ipfweight wcc_p shrm_bc shrm_sc shrm_in shrm_cu shrm_re shrm_pe, gen(wt) val(34 66 47 53 62 38 98 2 70 30 78 22 96 4) maxit(50) 

* Calculate weights based on Infogroup industry shares and firm size using iterative proportional fitting - the numbers in val are the marginal proportions from Infogroup sample (first number is percent with value 0 and second number in pair is percent with value 1) - up(5) is included so weights are trimmed to ensure no single respondent receives too much weight
ipfweight ind1 ind2 ind3 ind4 ind5 ind6 ind7 ind8 size_t1 size_t2, gen(wt3) val(63.11 36.89 93.86 6.14 78.74 21.26 97.16 2.84 92.17 7.83 98.38 1.62 92.82 7.18 91.62 8.28 49.73 50.27 76.6 23.4) maxit(50) up(5)

drop _merge

keep firm_id weight weight_age wt wt2 wt3 empsize estab_age size_t1 size_t2 size_t3 size_tercile
tempfile firm_weights
save `firm_weights'

use "$output_data/main_survey_wide.dta", clear
drop empsize estab_age
merge m:1 firm_id using `firm_weights'
* weight is Infogroup sample based weights for firms using industry and firm size using simple weighting scheme
* weight_age is Infogroup sample based weights for firms using industry and firm age using simple weighting scheme
* wt is SHRM weight based on 7 WCC policies using iterative proportional fitting (IPF)
* wt2 is weight based on 7 SHRM policies and Infogroup industries and firm sizes using IPF
* wt3 is Infogroup sample based weights for firms using industry and firm size using IPF

* Generate willingness to hire per subsidy rate - original
bysort subsidy_rate: egen m_hire = mean(hire_sub)
keep if !missing(industry) & !missing(wcc_policy) & !missing(shrm_best_candidate) & !missing(shrm_customers) & !missing(shrm_incentivized) & !missing(shrm_performance) & !missing(shrm_regulations) & !missing(shrm_second_chance) & !missing(empsize)

* Generate willingness to hire per subsidy rate - using restricted sample with nonmissing industry, 7 policies, and firm size
bysort subsidy_rate: egen m_hire_2 = mean(hire_sub)

* Generate willingness to hire per subsidy rate based on weighted by firm industry and firm size
tab weight
bysort subsidy_rate: egen double m_hire_info = total(hire_sub * weight) if !missing(weight)
bysort subsidy_rate: egen double den = total(weight) 
replace m_hire_info = m_hire_info/den //
drop den

* Generate willingness to hire per subsidy rate based on weighted by firm industry and firm age
tab weight_age
bysort subsidy_rate: egen double m_hire_info_age = total(hire_sub * weight_age) if !missing(weight_age)
bysort subsidy_rate: egen double den_age = total(weight_age) 
replace m_hire_info_age = m_hire_info_age/den_age //
drop den_age

* Generate willingness to hire per subsidy rate based on weighted by SHRM 7 WCC policies 
bysort subsidy_rate: egen double m_hire_SHRM = total(hire_sub * wt) if !missing(wt)
bysort subsidy_rate: egen double den_SHRM = total(wt) 
replace m_hire_SHRM = m_hire_SHRM/den_SHRM //
drop den_SHRM

* Generate willingness to hire per subsidy rate based on weighted by SHRM 7 WCC policies and Infogroup industry shares
bysort subsidy_rate: egen double m_hire_SHRM2 = total(hire_sub * wt2) if !missing(wt2)
bysort subsidy_rate: egen double den_SHRM2 = total(wt2) 
replace m_hire_SHRM2 = m_hire_SHRM2/den_SHRM2 //
drop den_SHRM2

* Generate willingness to hire per subsidy rate based on weighted by firm industry and firm size using IPF weight method
bysort subsidy_rate: egen double m_hire_info_ipf = total(hire_sub * wt3) if !missing(wt3)
bysort subsidy_rate: egen double den_ipf = total(wt3) 
replace m_hire_info_ipf = m_hire_info_ipf/den_ipf //
drop den_ipf

local vars "m_hire_info m_hire_info_age m_hire m_hire_SHRM m_hire_SHRM2 m_hire_2 m_hire_info_ipf"  
foreach i in `vars'{
	replace `i' = `i' * 100
	label var `i'      "Willing to Work with WCs (%)"
	bysort subsidy_rate: replace `i' = . if _n != 1 
	bysort effective_wage: gen t_`i' = round(`i')
	tostring t_`i', replace 
}

tempfile main_data
save `main_data'

use `main_data', clear
keep hire_sub subsidy_rate mgr_id firm_id
rename hire_sub hireBase
reshape long hire, i(mgr_id) j(demand_type) string 
gen constant = 1
gen weights = 1
gen reweight_shrm = 0
gen reweight_info = 0
gen subsample = 1
sort mgr_id subsidy_rate
tempfile data_weight0
save `data_weight0'

use `main_data', clear
keep hire_sub subsidy_rate mgr_id firm_id wt
rename hire_sub hireBase
reshape long hire, i(mgr_id) j(demand_type) string 
gen constant = 1
gen weights = wt
drop wt
gen reweight_shrm = 1
gen reweight_info = 0
gen subsample = 0
sort mgr_id subsidy_rate
tempfile data_weight1
save `data_weight1'

use `main_data', clear
keep hire_sub subsidy_rate mgr_id firm_id wt3
rename hire_sub hireBase
reshape long hire, i(mgr_id) j(demand_type) string 
gen constant = 1
gen weights = wt3
drop wt
gen reweight_shrm = 0
gen reweight_info = 1
gen subsample = 0
sort mgr_id subsidy_rate
tempfile data_weight3
save `data_weight3'

use "$output_data/main_survey_wide.dta", clear
keep hire_sub subsidy_rate mgr_id firm_id
rename hire_sub hireBase
reshape long hire, i(mgr_id) j(demand_type) string 
gen constant = 1
gen weights = 1
gen reweight_shrm = 0
gen reweight_info = 0
gen subsample = 0
sort mgr_id subsidy_rate
tempfile data_noweight
save `data_noweight'

append using `data_weight0'
sort mgr_id subsidy_rate
append using `data_weight1'
sort mgr_id subsidy_rate
append using `data_weight3'


save "$output_data/reweight_by_infogroup_industry_firmsize.dta", replace


/*=============================================
*** Make Platform Data merged with Infogroup ***    
==============================================*/

* Generate list of firms to fill in unmatched observations when merging by email
preserve 
use "$output_data/platform_universe.dta", clear
decode firm_id, gen(estab_name)
keep estab_name industry empsize platform_tenure state
collapse (mean) platform_tenure empsize (firstnm) industry, by(estab_name state)
sort estab_name state
duplicates tag estab_name, gen(dup)
drop if state == "" & dup >= 1
duplicates tag estab_name, gen(dup2)
drop if estab_name == ""
gsort -dup2
drop if estab_name == ""
tempfile platform_1
save `platform_1'
drop if dup2 > 0
tempfile platform_2
save `platform_2'
restore

* Merge in the industry and empsize for unmatched observations when merging by email
use "$output_data/main_survey_wide.dta", clear
keep email estab_name state
merge m:1 email using "$output_data/platform_universe.dta", keepusing(email)
keep if _merge == 1
drop _merge
merge m:1 estab_name using `platform_2', nogen keep(1 3) 
merge m:1 estab_name state using `platform_1', nogen keep(1 3)
drop dup dup2
rename estab_name estab_name_platform
rename empsize empsize_platform
rename industry industry_platform
rename state state_platform
rename platform_tenure platform_tenure_platform
gen surveyed = 1 // variable for whether the firm answered the survey 
tempfile filled 
save `filled'

* Merge survey, platform requestor, and firm covariates data by email 
use "$output_data/platform_universe.dta", clear 
decode firm_id, gen(estab_name_platform)
rename empsize empsize_platform
rename industry industry_platform
rename platform_tenure platform_tenure_platform
rename state state_platform
rename firm_id firm_id_platform
merge m:1 email using "$output_data/main_survey_wide.dta", keepusing(empsize industry estab_name platform_tenure state firm_id)
gen surveyed = 1 if _merge == 3 | _merge == 2
drop _merge
merge m:1 email using `filled', update
drop _merge
merge m:1 email using "$output_data/platform_requestor_info.dta", keep(1 3) nogen 
drop estab_name 
merge m:1 email using "$input_data/firm_covariates.dta", keepusing(estab_name) keep(3) nogen 
replace estab_name = estab_name_platform if estab_name_platform != "" & estab_name == ""

* Here we combine the firm_id's from the Platform data to the survey data so that respondents from the same surveyed firms share the same Platform firm id
sum firm_id_platform
replace firm_id_platform = firm_id + r(max) if firm_id != . & firm_id_platform == . 
bysort firm_id (email): egen firm_id_platform2 = mode(firm_id_platform) if surveyed == 1, maxmode
replace firm_id_platform = firm_id_platform2 if firm_id_platform2 != .
distinct firm_id_platform if surveyed == 1

* Make observations with the same firm id have the same estab_name
bysort firm_id_platform (email): egen estab_name2 = mode(estab_name) if firm_id_platform != ., maxmode
replace estab_name = estab_name2 if firm_id_platform != .

* Fill in missing state using state variables from platform_universe and survey data 
replace state_platform = state if state_platform == "" & state != ""
replace state_platform = state_code if state_platform == "" & state_code != ""
replace state_platform = "TX" if state_platform == "Texas"

* Make observations with same firm id have same state if it state is missing 
bysort firm_id_platform (email): egen state2 = mode(state_platform), maxmode
replace state_platform = state2 if state_platform == "" & state2 != "" & firm_id_platform != .

label var estab_name "Requestor Establishment (Raw)"

** Standardize names with NBER scripts 
gen standard_name = trim(estab_name)
replace standard_name=upper(standard_name)
replace standard_name = subinstr(standard_name,"-ADR"," ",30)
replace standard_name = subinstr(standard_name,"-ADS"," ",30)
replace standard_name = subinstr(standard_name,"-CL A "," ",30)
replace standard_name = subinstr(standard_name,"-CL B "," ",30)
replace standard_name = subinstr(standard_name,"-CONN "," ",30)
replace standard_name = subinstr(standard_name,"-CONSOLIDATED "," ",30)
replace standard_name = subinstr(standard_name,"-DEL "," ",30)
replace standard_name = subinstr(standard_name,"-DE "," ",30)
replace standard_name = subinstr(standard_name,"-NY SHARES "," ",30)
replace standard_name = subinstr(standard_name,"-OLD "," ",30)
replace standard_name = subinstr(standard_name,"-ORD "," ",30)
replace standard_name = subinstr(standard_name,"-PRE AMEND "," ",30)      /* JB */
replace standard_name = subinstr(standard_name,"-PRE DIVEST "," ",30)     /* JB */
replace standard_name = subinstr(standard_name,"-PREAMEND "," ",30)       /* JB */
replace standard_name = subinstr(standard_name,"-PREDIVEST "," ",30)      /* JB */
replace standard_name = subinstr(standard_name,"-PROJ "," ",30)       /* JB */
replace standard_name = subinstr(standard_name,"-PROJECTED "," ",30)      /* JB */
replace standard_name = subinstr(standard_name,"-PREF "," ",30)       /* JB */
replace standard_name = subinstr(standard_name,"-PRE FASB "," ",30)       /* JB */
replace standard_name = subinstr(standard_name,"-PREFASB "," ",30)        /* JB */
replace standard_name = subinstr(standard_name,"-PRO FORMA "," ",30)
replace standard_name = subinstr(standard_name,"- PRO FORMA "," ",30)
replace standard_name = subinstr(standard_name,"-PRO FORMA1 "," ",30)
replace standard_name = subinstr(standard_name,"-PRO FORMA2 "," ",30)
replace standard_name = subinstr(standard_name,"-PRO FORMA3 "," ",30)
replace standard_name = subinstr(standard_name,"-REDH "," ",30)
replace standard_name = subinstr(standard_name,"-SER A COM "," ",30)
replace standard_name = subinstr(standard_name,"-SER A "," ",30)
replace standard_name = subinstr(standard_name,"-SPN "," ",30)

replace standard_name = subinstr(standard_name," ACCPTNCE "," ACCEPTANCE ",30)
replace standard_name = subinstr(standard_name," BANCORPORATION "," BANCORP ",30)
replace standard_name = subinstr(standard_name," BANCORPORTN "," BANCORP ",30)
replace standard_name = subinstr(standard_name," BANCRP "," BANCORP ",30)
replace standard_name = subinstr(standard_name," BNCSHRS "," BANCSHARES ",30)
replace standard_name = subinstr(standard_name," BRWG "," BREWING ",30)
replace standard_name = subinstr(standard_name," CHEVRONTEXACO "," CHEVRON TEXACO ",30)
replace standard_name = subinstr(standard_name," CHSE "," CHASE ",30)
replace standard_name = subinstr(standard_name," COMMN "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," COMMUN "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," COMMUNICATNS "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," COMMUNICATIONS "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," DPT STS "," DEPT STORES ",30)
replace standard_name = subinstr(standard_name," DPT "," DEPT ",30)
replace standard_name = subinstr(standard_name," ENRGY "," ENERGY ",30)
replace standard_name = subinstr(standard_name," FINL "," FINANCIAL ",30)
replace standard_name = subinstr(standard_name," FNCL "," FINANCIAL ",30)
replace standard_name = subinstr(standard_name," GRP "," GROUP ",30)
replace standard_name = subinstr(standard_name," HLDGS "," HOLDINGS ",30)  
replace standard_name = subinstr(standard_name," HLDG "," HOLDING ",30)
replace standard_name = subinstr(standard_name," HLT NTWK "," HEALTH NETWORK ",30)
replace standard_name = subinstr(standard_name," HTLS RES "," HOTELS & RESORTS ",30)
replace standard_name = subinstr(standard_name," HLTH "," HEALTH ",30)
replace standard_name = subinstr(standard_name," INTRTECHNLGY "," INTERTECHNOLOGY ",30)
replace standard_name = subinstr(standard_name," JPMORGAN "," J P MORGAN ",30)
replace standard_name = subinstr(standard_name," MED OPTIC "," MEDICAL OPTICS ",30)
replace standard_name = subinstr(standard_name," MINNESOTA MINING AND MANUFACTURING COMPANY "," 3M COMPANY ",30)
replace standard_name = subinstr(standard_name," NAT RES "," NATURAL RESOURCES ",30)
replace standard_name = subinstr(standard_name," NETWRKS "," NETWORK ",30)
replace standard_name = subinstr(standard_name," PHARMACTICALS "," PHARM ",30)
replace standard_name = subinstr(standard_name," PHARMACT "," PHARM ",30)
replace standard_name = subinstr(standard_name," PPTYS TST "," PROPERTIES TRUST ",30)
replace standard_name = subinstr(standard_name," PPTY "," PROPERTY ",30)
replace standard_name = subinstr(standard_name," PROPERTY TR "," PROPERTY TRUST ",30)
replace standard_name = subinstr(standard_name," PAC RAILWY "," PACIFIC RAILWAY ",30)
replace standard_name = subinstr(standard_name," SEMICONDTR "," SEMICONDUCTOR ",30)
replace standard_name = subinstr(standard_name," SOLU "," SOLUTIONS ",30)
replace standard_name = subinstr(standard_name," ST & ALMN "," STEEL & ALUMINUM ",30)
replace standard_name = subinstr(standard_name," STD "," STANDARD ",30)
replace standard_name = subinstr(standard_name," TECHNOLGS "," TECH ",30)
replace standard_name = subinstr(standard_name," TECHNOL "," TECH ",30)
replace standard_name = subinstr(standard_name," TRANSPORTATN "," TRANSPORTATION ",30)


replace standard_name = subinstr(standard_name," ADVERTSG "," ADVERTISING ",30)  /* JB */
replace standard_name = subinstr(standard_name," ADVNTGE "," ADVANTAGE ",30)     /* JB */
replace standard_name = subinstr(standard_name," AIRLN "," AIRLINES ",30)    /* JB */
replace standard_name = subinstr(standard_name," AIRLS "," AIRLINES ",30)    /* JB */
replace standard_name = subinstr(standard_name," AM "," AMERICA ",30)    /* JB */
replace standard_name = subinstr(standard_name," AMER "," AMERICAN ",30)     /* JB */
replace standard_name = subinstr(standard_name," APPLIAN "," APPLIANCES ",30)    /* JB */
replace standard_name = subinstr(standard_name," APPLICTN "," APPLICATIONS ",30)     /* JB */
replace standard_name = subinstr(standard_name," ARCHTCTS "," ARCHITECTS ",30)   /* JB */
replace standard_name = subinstr(standard_name," ASSD "," ASSOCIATED ",30)   /* JB */
replace standard_name = subinstr(standard_name," ASSOC "," ASSOCIATES ",30)  /* JB */
replace standard_name = subinstr(standard_name," ASSOCS "," ASSOCIATES ",30)     /* JB */
replace standard_name = subinstr(standard_name," ATOMC "," ATOMIC ",30)  /* JB */
replace standard_name = subinstr(standard_name," BANCSH "," BANCSHARES ",30)     /* JB */
replace standard_name = subinstr(standard_name," BANCSHR "," BANCSHARES ",30)    /* JB */
replace standard_name = subinstr(standard_name," BCSHS "," BANCSHARES ",30)  /* JB */
replace standard_name = subinstr(standard_name," BK "," BANK ",30)   /* JB */
replace standard_name = subinstr(standard_name," BLDGS "," BUILDINGS ",30)   /* JB */
replace standard_name = subinstr(standard_name," BROADCASTG "," BROADCASTING ",30)   /* JB */
replace standard_name = subinstr(standard_name," BTLNG "," BOTTLING ",30)    /* JB */
replace standard_name = subinstr(standard_name," CBLVISION "," CABLEVISION ",30)     /* JB */
replace standard_name = subinstr(standard_name," CENTRS "," CENTERS ",30)    /* JB */
replace standard_name = subinstr(standard_name," CHAMPNSHIP "," CHAMPIONSHIP ",30)   /* JB */
replace standard_name = subinstr(standard_name," CMMNCTNS "," COMMUNICATIONS ",30)   /* JB */
replace standard_name = subinstr(standard_name," CNVRSION "," CONVERSION ",30)   /* JB */
replace standard_name = subinstr(standard_name," COFF "," COFFEE ",30)   /* JB */
replace standard_name = subinstr(standard_name," COMM "," COMMUNICATIONS ",30)   /* JB */
replace standard_name = subinstr(standard_name," COMMUN "," COMMUNICATIONS ",30)     /* JB */
replace standard_name = subinstr(standard_name," COMMUNCTN "," COMMUNICATIONS ",30)  /* JB */
replace standard_name = subinstr(standard_name," COMMUNICTNS "," COMMUNICATIONS ",30)    /* JB */
replace standard_name = subinstr(standard_name," COMP "," COMPUTERS ",30)    /* JB */
replace standard_name = subinstr(standard_name," COMPUTR "," COMPUTER ",30)  /* JB */
replace standard_name = subinstr(standard_name," CONFERENCG "," CONFERENCING ",30)   /* JB */
replace standard_name = subinstr(standard_name," CONSTRN "," CONSTR ",30)    /* JB */
replace standard_name = subinstr(standard_name," CONTL "," CONTINENTAL ",30)     /* JB */
replace standard_name = subinstr(standard_name," CONTNT "," CONTINENTAL ",30)    /* JB */
replace standard_name = subinstr(standard_name," CONTRL "," CONTROL ",30)    /* JB */
replace standard_name = subinstr(standard_name," CONTRL "," CONTROL ",30)    /* JB */
replace standard_name = subinstr(standard_name," CTR "," CENTER ",30)    /* JB */
replace standard_name = subinstr(standard_name," CTRS "," CENTERS ",30)  /* JB */
replace standard_name = subinstr(standard_name," CVRGS "," COVERINGS ",30)   /* JB */
replace standard_name = subinstr(standard_name," DEV "," DEVELOPMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," DEVL "," DEVELOPMENT ",30)  /* JB */
replace standard_name = subinstr(standard_name," DEVLP "," DEVELOPMENT ",30)     /* JB */
replace standard_name = subinstr(standard_name," DISTR "," DISTRIBUTION ",30)    /* JB */
replace standard_name = subinstr(standard_name," DISTRIBUT "," DISTRIBUTION ",30)    /* JB */
replace standard_name = subinstr(standard_name," DISTRIBUTN "," DISTRIBUTION ",30)   /* JB */
replace standard_name = subinstr(standard_name," ELCTRNCS "," ELECTRONICS ",30)  /* JB */
replace standard_name = subinstr(standard_name," ELECTR "," ELECTRONICS ",30)    /* JB */
replace standard_name = subinstr(standard_name," ENGNRD "," ENGINEERED ",30)     /* JB */
replace standard_name = subinstr(standard_name," ENMT "," ENTERTAINMENT ",30)    /* JB */
replace standard_name = subinstr(standard_name," ENTERTAIN "," ENTERTAINMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," ENTERTNMNT "," ENTERTAINMENT ",30)  /* JB */
replace standard_name = subinstr(standard_name," ENTMNT "," ENTERTAINMENT ",30)  /* JB */
replace standard_name = subinstr(standard_name," ENTMT "," ENTERTAINMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," ENTRPR "," ENTERPRISES ",30)    /* JB */
replace standard_name = subinstr(standard_name," ENTRPRISE "," ENTERPRISES ",30)     /* JB */
replace standard_name = subinstr(standard_name," ENTRPRS "," ENTERPRISES ",30)   /* JB */
replace standard_name = subinstr(standard_name," ENVIR "," ENVIRONMENTAL ",30)   /* JB */
replace standard_name = subinstr(standard_name," ENVIRNMNTL "," ENVIRONMENTAL ",30)  /* JB */
replace standard_name = subinstr(standard_name," ENVR "," ENVIRONMENTAL ",30)    /* JB */
replace standard_name = subinstr(standard_name," EQUIPMT "," EQUIPMENT ",30)     /* JB */
replace standard_name = subinstr(standard_name," EXCHG "," EXCHANGE ",30)    /* JB */
replace standard_name = subinstr(standard_name," EXPLOR "," EXPLORATION ",30)    /* JB */
replace standard_name = subinstr(standard_name," FNDG "," FUNDING ",30)  /* JB */
replace standard_name = subinstr(standard_name," GLD "," GOLD ",30)  /* JB */
replace standard_name = subinstr(standard_name," GP "," GROUP ",30)  /* JB */
replace standard_name = subinstr(standard_name," HLDS "," HLDGS ",30)    /* JB */
replace standard_name = subinstr(standard_name," HLTHCARE "," HEALTHCARE ",30)   /* JB */
replace standard_name = subinstr(standard_name," HLTHCR "," HEALTHCARE ",30)     /* JB */
replace standard_name = subinstr(standard_name," HOMEMDE "," HOMEMADE ",30)  /* JB */
replace standard_name = subinstr(standard_name," HSPTL "," HOSPITAL ",30)    /* JB */
replace standard_name = subinstr(standard_name," ILLUM "," ILLUMINATION ",30)    /* JB */
replace standard_name = subinstr(standard_name," INDL "," INDUSTRIAL ",30)   /* JB */
replace standard_name = subinstr(standard_name," INDPT "," INDEPENDENT ",30)     /* JB */
replace standard_name = subinstr(standard_name," INDTY "," INDEMNITY ",30)   /* JB */
replace standard_name = subinstr(standard_name," INFORMATN "," INFO ",30)    /* JB */
replace standard_name = subinstr(standard_name," INSTNS "," INSTITUTIONS ",30)   /* JB */
replace standard_name = subinstr(standard_name," INSTRUMEN "," INSTRUMENTS ",30)     /* JB */
replace standard_name = subinstr(standard_name," INSTRUMNT "," INSTRUMENTS ",30)     /* JB */
replace standard_name = subinstr(standard_name," INTEGRATRS "," INTEGRATORS ",30)    /* JB */
replace standard_name = subinstr(standard_name," INTERNATIONL "," INT ",30)  /* JB */
replace standard_name = subinstr(standard_name," INVS "," INVESTMENTS ",30)  /* JB */
replace standard_name = subinstr(standard_name," INVT "," INVESTMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," MANAGEMNT "," MANAGEMENT ",30)  /* JB */
replace standard_name = subinstr(standard_name," MANAGMNT "," MANAGEMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," MANHATN "," MANHATTAN ",30)     /* JB */
replace standard_name = subinstr(standard_name," MANUF "," MFG ",30)     /* JB */
replace standard_name = subinstr(standard_name," MDSE "," MERCHANDISING ",30)    /* JB */
replace standard_name = subinstr(standard_name," MEASURMNT "," MEASUREMENT ",30)     /* JB */
replace standard_name = subinstr(standard_name," MERCHNDSNG "," MERCHANDISING ",30)  /* JB */
replace standard_name = subinstr(standard_name," MGMT "," MANAGEMENT ",30)   /* JB */
replace standard_name = subinstr(standard_name," MGRS "," MANAGERS ",30)     /* JB */
replace standard_name = subinstr(standard_name," MGT "," MANAGEMENT ",30)    /* JB */
replace standard_name = subinstr(standard_name," MICROWAV "," MICROWAVE ",30)    /* JB */
replace standard_name = subinstr(standard_name," MKTS "," MARKETS ",30)  /* JB */
replace standard_name = subinstr(standard_name," MLTIMEDIA "," MULTIMEDIA ",30)  /* JB */
replace standard_name = subinstr(standard_name," MTG "," MORTGAGE ",30)  /* JB */
replace standard_name = subinstr(standard_name," MTNS "," MOUTAINS ",30)     /* JB */
replace standard_name = subinstr(standard_name," MTRS "," MOTORS ",30)   /* JB */
replace standard_name = subinstr(standard_name," NETWRK "," NETWORK ",30)    /* JB */
replace standard_name = subinstr(standard_name," NOWEST "," NORTHWEST ",30)  /* JB */
replace standard_name = subinstr(standard_name," NTWRK "," NETWORK ",30)     /* JB */
replace standard_name = subinstr(standard_name," OFFSHRE "," OFFSHORE ",30)  /* JB */
replace standard_name = subinstr(standard_name," ORGANIZTN "," ORG ",30)     /* JB */
replace standard_name = subinstr(standard_name," PBLG "," PUBLISHING ",30)   /* JB */
replace standard_name = subinstr(standard_name," PHARMACEUTICL "," PHARM ",30)   /* JB */
replace standard_name = subinstr(standard_name," PLAST "," PLASTICS ",30)    /* JB */
replace standard_name = subinstr(standard_name," PPTYS "," PROPERTIES ",30)  /* JB */
replace standard_name = subinstr(standard_name," PRODS "," PROD ",30)    /* JB */
replace standard_name = subinstr(standard_name," PRODTN "," PRODN ",30)  /* JB */
replace standard_name = subinstr(standard_name," PRODUCTN "," PRODN ",30)    /* JB */
replace standard_name = subinstr(standard_name," PRPANE "," PROPANE ",30)    /* JB */
replace standard_name = subinstr(standard_name," PTS "," PARTS ",30)     /* JB */
replace standard_name = subinstr(standard_name," PUBLISH "," PUBLISHING ",30)    /* JB */
replace standard_name = subinstr(standard_name," PUBLSHING "," PUBLISHING ",30)  /* JB */
replace standard_name = subinstr(standard_name," PUBN "," PUBLICATIONS ",30)     /* JB */
replace standard_name = subinstr(standard_name," PUBNS "," PUBLICATIONS ",30)    /* JB */
replace standard_name = subinstr(standard_name," PWR "," POWER ",30)     /* JB */
replace standard_name = subinstr(standard_name," RAILRD "," RAILROAD ",30)   /* JB */
replace standard_name = subinstr(standard_name," RECREATN "," RECREATION ",30)   /* JB */
replace standard_name = subinstr(standard_name," RECYCL "," RECYCLING ",30)  /* JB */
replace standard_name = subinstr(standard_name," REFIN "," REFINING ",30)    /* JB */
replace standard_name = subinstr(standard_name," REFNG "," REFINING ",30)    /* JB */
replace standard_name = subinstr(standard_name," RESTR "," RESTAURANT ",30)  /* JB */
replace standard_name = subinstr(standard_name," RESTS "," RESTAURANTS ",30)     /* JB */
replace standard_name = subinstr(standard_name," RETAILNG "," RETAILING ",30)    /* JB */
replace standard_name = subinstr(standard_name," RLTY "," REALTY ",30)   /* JB */
replace standard_name = subinstr(standard_name," RR "," RAILROAD ",30)   /* JB */
replace standard_name = subinstr(standard_name," RSCH "," RESEARCH ",30)     /* JB */
replace standard_name = subinstr(standard_name," RTNG "," RATING ",30)   /* JB */
replace standard_name = subinstr(standard_name," SCIENTIF "," SCIENTIFIC ",30)   /* JB */
replace standard_name = subinstr(standard_name," SERV "," SERVICES ",30)     /* JB */
replace standard_name = subinstr(standard_name," SLTNS "," SOLUTIONS ",30)   /* JB */
replace standard_name = subinstr(standard_name," SOFTWRE "," SOFTWARE ",30)  /* JB */
replace standard_name = subinstr(standard_name," SOLTNS "," SOLUTIONS ",30)  /* JB */
replace standard_name = subinstr(standard_name," SOLUT "," SOLUTIONS ",30)   /* JB */
replace standard_name = subinstr(standard_name," SRVC "," SERVICES ",30)     /* JB */
replace standard_name = subinstr(standard_name," SRVCS "," SERVICES ",30)    /* JB */
replace standard_name = subinstr(standard_name," STEAKHSE "," STEAKHOUSE ",30)   /* JB */
replace standard_name = subinstr(standard_name," STHWST "," SOUTHWEST ",30)  /* JB */
replace standard_name = subinstr(standard_name," STL "," STEEL ",30)     /* JB */
replace standard_name = subinstr(standard_name," STRS "," STORES ",30)   /* JB */
replace standard_name = subinstr(standard_name," SUP "," SUPPLY ",30)    /* JB */
replace standard_name = subinstr(standard_name," SUPERMKTS "," SUPERMARKETS ",30)    /* JB */
replace standard_name = subinstr(standard_name," SUPP "," SUPPLIES ",30)     /* JB */
replace standard_name = subinstr(standard_name," SURVYS "," SURVEYS ",30)    /* JB */
replace standard_name = subinstr(standard_name," SVC "," SERVICES ",30)  /* JB */
replace standard_name = subinstr(standard_name," SVCS "," SERVICES ",30)     /* JB */
replace standard_name = subinstr(standard_name," SVSC "," SERVICES ",30)     /* JB */
replace standard_name = subinstr(standard_name," SYS "," SYSTEMS ",30)   /* JB */
replace standard_name = subinstr(standard_name," SYSTM "," SYSTEMS ",30)     /* JB */
replace standard_name = subinstr(standard_name," TCHNLGY "," TECH ",30)  /* JB */
replace standard_name = subinstr(standard_name," TECHNGS "," TECHNOLOGIES ",30)  /* JB */
replace standard_name = subinstr(standard_name," TECHNL "," TECH ",30)   /* JB */
replace standard_name = subinstr(standard_name," TECHNLGIES "," TECHNOLOGIES ",30)   /* JB */
replace standard_name = subinstr(standard_name," TEL "," TELEPHONE ",30)     /* JB */
replace standard_name = subinstr(standard_name," TELE-COMM "," TELECOMMUNICATIONS ",30)  /* JB */
replace standard_name = subinstr(standard_name," TELE-COMMUN "," TELECOMMUNICATIONS ",30)    /* JB */
replace standard_name = subinstr(standard_name," TELECOMMS "," TELECOMMUNICATIONS ",30)  /* JB */
replace standard_name = subinstr(standard_name," TELECONFERENC "," TELECONFERENCING ",30)    /* JB */
replace standard_name = subinstr(standard_name," TELEG "," TELEGRAPH ",30)   /* JB */
replace standard_name = subinstr(standard_name," TELEGR "," TELEGRAPH ",30)  /* JB */
replace standard_name = subinstr(standard_name," TELVSN "," TELEVISION ",30)     /* JB */
replace standard_name = subinstr(standard_name," TR "," TRUST ",30)  /* JB */
replace standard_name = subinstr(standard_name," TRANSN "," TRANSPORTATION ",30)     /* JB */
replace standard_name = subinstr(standard_name," TRANSPORTN "," TRANSPORTATION ",30)     /* JB */
replace standard_name = subinstr(standard_name," TRNSACTN "," TRANSACTION ",30)  /* JB */
replace standard_name = subinstr(standard_name," UTD "," UNITED ",30)    /* JB */
replace standard_name = subinstr(standard_name," WSTN "," WESTERN ",30)  /* JB */
replace standard_name = subinstr(standard_name," WTR "," WATER ",30)     /* JB */
replace standard_name=" U.S. PHILIPS CORPORATION " if trim(standard_name)=="NORTH AMERICAN PHILIPS CORP"
replace standard_name=" A. L. WILLIAMS CORP. " if trim(standard_name)=="WILLIAMS (A.L.) CORP"
replace standard_name=" B. F. GOODRICH CO. " if trim(standard_name)=="GOODRICH CORP"
replace standard_name=" BELL + HOWELL COMPANY " if trim(standard_name)=="BELL & HOWELL OPERATING CO"
replace standard_name=" BENDIX CORPORATION(NOW ALLIED-SIGNAL INC.) " if trim(standard_name)=="BENDIX CORP"
replace standard_name=" BORG-WARNER CORPORATION " if trim(standard_name)=="BORGWARNER INC"
replace standard_name=" CHRYSLER MOTORS CORPORATION " if trim(standard_name)=="CHRYSLER CORP"
replace standard_name=" CISCO TECHNOLOGY, INC. " if trim(standard_name)=="CISCO SYSTEMS INC"
replace standard_name=" DELL PRODUCTS, L.P. " if trim(standard_name)=="DELL INC"
replace standard_name=" DELPHI TECHNOLOGIES, INC. " if trim(standard_name)=="DELPHI CORP"
replace standard_name=" E. I. DU PONT DE NEMOURS AND COMPANY " if trim(standard_name)=="DU PONT (E I) DE NEMOURS"
replace standard_name=" E. R. SQUIBB + SONS, INC. " if trim(standard_name)=="SQUIBB CORP"
replace standard_name=" ELI LILLY AND COMPANY " if trim(standard_name)=="LILLY (ELI) & CO"
replace standard_name=" G. D. SEARLE & CO. " if trim(standard_name)=="SEARLE (G.D.) & CO"
replace standard_name=" MINNESOTA MINING AND MANUFACTURING COMPANY " if trim(standard_name)=="3M CO"
replace standard_name=" OWENS-CORNING FIBERGLAS CORPORATION " if trim(standard_name)=="OWENS CORNING"
replace standard_name=" SCHLUMBERGER TECHNOLOGY CORPORATION " if trim(standard_name)=="SCHLUMBERGER LTD"
replace standard_name=" SCI-MED LIFE SYSTEMS, INC. " if trim(standard_name)=="SICMED LIFE SYSTEMS"
replace standard_name=" TDK CORPORATION " if trim(standard_name)=="TDK CORP"
replace standard_name=" UNITED STATES SURGICAL CORPORATION " if trim(standard_name)=="U S SURGICAL CORP"
replace standard_name=" W. R. GRACE & CO. " if trim(standard_name)=="GRACE (W R) & CO"
replace standard_name=" WESTINGHOUSE ELECTRIC CORP. " if trim(standard_name)=="WESTINGHOUSE ELEC"

** EPO Espace specific character format problems
** For files downloaded from EPO Espace & appears as &amp; 
** Also recode all common words for "AND" to &
tempvar len
gen `len' =length(standard_name)

replace standard_name = subinstr( standard_name, "&AMP;", " & ", 5)
replace standard_name = subinstr( standard_name, "+", " & ", 5)
replace standard_name = subinstr( standard_name, " AND ", " & ", 5)
replace standard_name = subinstr( standard_name, " ET ", " & ", 5)
replace standard_name = subinstr( standard_name, " UND ", " & ", 5)
replace standard_name = subinstr( standard_name, "&", " & ",30)     /* BHH - ensure that & is separate word */

** British - specific problem with names that end in (THE) and names that start with
** THE, so remove these
replace standard_name=substr(standard_name, 1, `len'-5) if substr(standard_name, -5, 5)=="(THE)"
replace standard_name=substr(standard_name, 5, .) if substr(standard_name, 1, 4)=="THE "

** strip punctuation 
replace standard_name = subinstr( standard_name, "'",  "", 30)
replace standard_name = subinstr( standard_name, ";",  "", 30) 
replace standard_name = subinstr( standard_name, "^",  "", 30)
replace standard_name = subinstr( standard_name, "<",  "", 30)
replace standard_name = subinstr( standard_name, ".",  "", 30)
replace standard_name = subinstr( standard_name, "`",  "", 30)
replace standard_name = subinstr( standard_name, "_",  "", 30)
replace standard_name = subinstr( standard_name, ">",  "", 30)
replace standard_name = subinstr( standard_name, "''", "", 30)
replace standard_name = subinstr( standard_name, "!",  "", 30)
replace standard_name = subinstr( standard_name, "+",  "", 30)
replace standard_name = subinstr( standard_name, "?",  "", 30)
replace standard_name = subinstr( standard_name, "(",  "", 30)
replace standard_name = subinstr( standard_name, "£",  "", 30)
replace standard_name = subinstr( standard_name, "{",  "", 30)
replace standard_name = subinstr( standard_name, "\",  "", 30)
replace standard_name = subinstr( standard_name, ")",  "", 30)
replace standard_name = subinstr( standard_name, "$",  "", 30)
replace standard_name = subinstr( standard_name, "}",  "", 30)
replace standard_name = subinstr( standard_name, "|",  "", 30)
replace standard_name = subinstr( standard_name, ",",  "", 30)
replace standard_name = subinstr( standard_name, "%",  "", 30)
replace standard_name = subinstr( standard_name, "[",  "", 30)
replace standard_name = subinstr( standard_name, "¦",  "", 30)
replace standard_name = subinstr( standard_name, "*",  "", 30)
replace standard_name = subinstr( standard_name, "]",  "", 30)
replace standard_name = subinstr( standard_name, "/",  " ", 30) 
replace standard_name = subinstr( standard_name, "@",  "", 30)
replace standard_name = subinstr( standard_name, ":",  "", 30)
replace standard_name = subinstr( standard_name, "~",  "", 30)
replace standard_name = subinstr( standard_name, "#",  "", 30)
replace standard_name = subinstr( standard_name, "-",  " ", 30) 
replace standard_name = subinstr( standard_name, "  ", " ", 30) 


*** Clean common corporate type identifiers 
** 2) Perform some additional changes
replace standard_name = subinstr( standard_name, " RES & DEV ", " R&D ", 1)
replace standard_name = subinstr( standard_name, " RECH & DEV ", " R&D ", 1)

** 3) Perform some country specific work
** UNITED STATES (most of this is in Derwent)

** UNITED KINGDOM
replace standard_name = subinstr( standard_name, " PUBLIC LIMITED ", " PLC ", 1)
replace standard_name = subinstr( standard_name, " PUBLIC LIABILITY COMPANY ", " PLC ", 1)
replace standard_name = subinstr( standard_name, " HOLDINGS ", " HLDGS ", 1)
replace standard_name = subinstr( standard_name, " HOLDING ", " HLDGS ", 1)
replace standard_name = subinstr( standard_name, " GREAT BRITAIN ", " GB ", 1)
replace standard_name = subinstr( standard_name, " LTD CO ", " CO LTD ", 1)

** SPANISH
replace standard_name = subinstr( standard_name, " SOC LIMITADA ", " SL ", 1)
replace standard_name = subinstr( standard_name, " SOC EN COMMANDITA ", " SC ", 1)
replace standard_name = subinstr( standard_name, " & CIA ", " CO ", 1)

rename standard_name company
label var company "Requestor Establishment (Cleaned, for matching)"
replace empsize_platform = empsize if empsize_platform == . & empsize != .
replace industry_platform = industry if industry_platform == "" & industry != ""
replace platform_tenure_platform = platform_tenure if platform_tenure_platform == . & platform_tenure != .
gen num_months_post_2019 = (jan2019 > 0 & !missing(jan2019)) + (feb2019 > 0 & !missing(feb2019)) + (mar2019 > 0 & !missing(mar2019)) + (apr2019 > 0 & !missing(apr2019)) + (may2019 > 0 & !missing(may2019)) + (jun2019 > 0 & !missing(jun2019)) + (jul2019 > 0 & !missing(jul2019)) + (aug2019 > 0 & !missing(aug2019)) + (sep2019 > 0 & !missing(sep2019)) + (oct2019 > 0 & !missing(oct2019)) + (nov2019 > 0 & !missing(nov2019)) + (dec2019 > 0 & !missing(dec2019))
replace num_months_post_2019 = . if missing(jan2019) | missing(feb2019) | missing(mar2019) | missing(apr2019) | missing(may2019) | missing(jun2019) | missing(jul2019) | missing(aug2019) | missing(sep2019) | missing(oct2019) | missing(nov2019) | missing(dec2019)
gen multiday = multiday_count > 0 & !missing(multiday)

tempfile platform_emails
save `platform_emails'

keep industry_platform empsize_platform surveyed company platform_tenure_platform vacant_job modal_category state_platform num_months_post_2019 multiday estab_name firm_id_platform n_jobs

drop if company == "" 
replace platform_tenure_platform = . if platform_tenure_platform < 0
collapse (mean) empsize_platform platform_tenure_platform vacant_job num_months_post_2019 n_jobs (firstnm) estab_name industry_platform surveyed modal_category multiday firm_id_platform, by(company state_platform)

replace surveyed = 0 if surveyed != 1
rename empsize_platform empsize
rename platform_tenure_platform platform_tenure
rename industry_platform industry
rename state_platform state_abbrev

* Merge in infogroup data that was merged with the survey data
merge m:1 estab_name state_abbrev using "$output_data/infogroup_data.dta", nogen keep(1 3)

preserve 
import excel "$input_data/merge_manual_review.xlsx", clear firstrow
duplicates drop

tempfile match_approved
save `match_approved', replace
restore 

merge m:1 standard_name company using `match_approved', nogen keep(1 3)

gen company_raw = company
foreach var in sic sic_desc estab_year emp_size_bin {
	cap: replace `var'  = "" if match_approved != 1 
	cap: replace `var'  = . if match_approved != 1 
	assert missing(`var') if match_approved != 1 
}

rename empsize empsize_1
rename sic sic_1

* Merge in the infogroup universe data by cleaned company name to bring in data for firms that did not answer the survey
merge m:m company using "$output_data/infogroup_universe.dta"
drop if _merge == 2

* Collapse by company name and state abbreviation
collapse (mean) empsize empsize_1 platform_tenure vacant_job num_months_post_2019 estab_year estab_age n_jobs (firstnm) estab_name industry surveyed modal_category multiday sic_1 sic firm_id_platform, by(company state_abbrev)

* Drop duplicates with missing state
bysort company: egen missing_states = total(missing(state_abbrev))
duplicates tag company, gen(duplicate_names)
drop if duplicate_names > 0 & missing_states > 0 & missing(state_abbrev) & (missing_states <= duplicate_names )
drop duplicate_names missing_states

* Fill in missing information using data from infogroup_data.dta
replace empsize = empsize_1 if missing(empsize)
sum estab_age if surveyed == 1
gen estab_age_old = estab_age
replace estab_age = 2020 - estab_year if estab_age == .
replace industry = sic if industry == ""

* Use mode or mean response for companies with same firm_id
foreach var of varlist empsize platform_tenure vacant_job num_months_post_2019 estab_year estab_age n_jobs  {
	bysort firm_id_platform: egen mean_`var' = mean(`var')
	replace `var' = mean_`var' if !missing(firm_id_platform)
	
}

foreach var of varlist estab_name industry surveyed modal_category multiday sic_1 sic firm_id_platform state_abbrev company {
	bysort firm_id_platform: egen mode_`var' = mode(`var'), maxmode
	replace `var' = mode_`var' if !missing(firm_id_platform)
	
}

duplicates drop firm_id_platform if !missing(firm_id_platform), force 

save "$output_data/platform_infogroup.dta", replace

log close
exit 
